author    | Dimitry Andric <dim@FreeBSD.org> | 2014-11-24 17:02:24 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2014-11-24 17:02:24 +0000
commit    | 91bc56ed825ba56b3cc264aa5c95ab84f86832ab (patch)
tree      | 4df130b28021d86e13bf4565ef58c1c5a5e093b4 /contrib/llvm/lib/Target/NVPTX
parent    | 9efc7e72bb1daf5d6019871d9c93a1c488a11229 (diff)
parent    | 5ca98fd98791947eba83a1ed3f2c8191ef7afa6c (diff)
Merge llvm 3.5.0 release from ^/vendor/llvm/dist, resolve conflicts, and
preserve our customizations, where necessary.
Notes:
svn path=/projects/clang350-import/; revision=274968
Diffstat (limited to 'contrib/llvm/lib/Target/NVPTX')
47 files changed, 12664 insertions, 1206 deletions
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp index d5be0e42ab0c..80b2f621fb94 100644 --- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp @@ -11,20 +11,21 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "InstPrinter/NVPTXInstPrinter.h" -#include "NVPTX.h" #include "MCTargetDesc/NVPTXBaseInfo.h" +#include "NVPTX.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include <cctype> using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "NVPTXGenAsmWriter.inc" @@ -56,13 +57,13 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << "%r"; break; case 4: - OS << "%rl"; + OS << "%rd"; break; case 5: OS << "%f"; break; case 6: - OS << "%fl"; + OS << "%fd"; break; } diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h index 93029ae8d39e..1fb3c57390c2 100644 --- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h @@ -27,8 +27,8 @@ public: NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override; // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, raw_ostream &O); @@ -37,15 +37,15 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = 0); + raw_ostream &O, const char *Modifier = nullptr); void printMemOperand(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = 0); + raw_ostream &O, const char *Modifier = nullptr); void printProtoIdent(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = 0); + raw_ostream &O, const char *Modifier = nullptr); }; } diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h index edf4a80b11e6..16ec19c25f16 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -43,14 +43,16 @@ enum PropertyAnnotation { PROPERTY_ISSAMPLER, PROPERTY_ISREADONLY_IMAGE_PARAM, PROPERTY_ISWRITEONLY_IMAGE_PARAM, + PROPERTY_ISREADWRITE_IMAGE_PARAM, PROPERTY_ISKERNEL_FUNCTION, PROPERTY_ALIGN, + PROPERTY_MANAGED, // last property PROPERTY_LAST }; -const unsigned AnnotationNameLen = 8; // length of each annotation name +const unsigned AnnotationNameLen = 9; // length of each annotation name const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = { "maxntidx", // PROPERTY_MAXNTID_X "maxntidy", // PROPERTY_MAXNTID_Y @@ -64,8 +66,10 @@ const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = { "sampler", // PROPERTY_ISSAMPLER "rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM "wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM + "rdwrimage", // PROPERTY_ISREADWRITE_IMAGE_PARAM "kernel", // PROPERTY_ISKERNEL_FUNCTION "align", // PROPERTY_ALIGN + "managed", // PROPERTY_MANAGED // last property "proplast", // PROPERTY_LAST @@ -80,6 +84,17 @@ __attribute__((unused)) #endif static const char *NamedMDForAnnotations = "nvvm.annotations"; +namespace NVPTXII { +enum { + // These must be kept in sync with TSFlags in NVPTXInstrFormats.td + IsTexFlag = 0x80, + IsSuldMask = 0x300, + IsSuldShift = 8, + IsSustFlag = 0x400, + IsSurfTexQueryFlag = 0x800, + IsTexModeUnifiedFlag = 0x1000 +}; +} } #endif diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index f2784b836b70..366341afe1b8 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -33,8 +33,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) { CommentString = "//"; - PrivateGlobalPrefix = "$L__"; - HasSetDirective = false; HasSingleParameterDotFile = false; @@ -49,7 +47,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) { Data16bitsDirective = " .b16 "; Data32bitsDirective = " .b32 "; Data64bitsDirective = " .b64 "; - PrivateGlobalPrefix = ""; ZeroDirective = " .b8"; AsciiDirective = " .b8"; AscizDirective = " .b8"; diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 871bac94f84c..158ca901e586 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ 
b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -12,14 +12,16 @@ //===----------------------------------------------------------------------===// #include "NVPTXMCTargetDesc.h" -#include "NVPTXMCAsmInfo.h" #include "InstPrinter/NVPTXInstPrinter.h" +#include "NVPTXMCAsmInfo.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "NVPTXGenInstrInfo.inc" @@ -29,8 +31,6 @@ #define GET_REGINFO_MC_DESC #include "NVPTXGenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createNVPTXMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitNVPTXMCInstrInfo(X); @@ -66,7 +66,7 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) return new NVPTXInstPrinter(MAI, MII, MRI, STI); - return 0; + return nullptr; } // Force static initialization. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h index 490b49d9ead6..e74c808f8554 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h @@ -61,10 +61,14 @@ inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); +ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMPass(); +FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass(); ModulePass *createNVVMReflectPass(); ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping); MachineFunctionPass *createNVPTXPrologEpilogPass(); +MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); +FunctionPass *createNVPTXImageOptimizerPass(); bool isImageOrSamplerVal(const Value *, const Module *); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td index 6183a755c320..93fabf615369 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td @@ -34,12 +34,18 @@ def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30", "Target SM 3.0">; def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35", "Target SM 3.5">; +def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50", + "Target SM 5.0">; // PTX Versions def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30", "Use PTX version 3.0">; def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31", "Use PTX version 3.1">; +def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", + "Use PTX version 3.2">; +def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40", + "Use PTX version 4.0">; //===----------------------------------------------------------------------===// // NVPTX supported processors. 
@@ -52,17 +58,12 @@ def : Proc<"sm_20", [SM20]>; def : Proc<"sm_21", [SM21]>; def : Proc<"sm_30", [SM30]>; def : Proc<"sm_35", [SM35]>; +def : Proc<"sm_50", [SM50]>; def NVPTXInstrInfo : InstrInfo { } -def NVPTXAsmWriter : AsmWriter { - bit isMCAsmWriter = 1; - string AsmWriterClassName = "InstPrinter"; -} - def NVPTX : Target { let InstructionSet = NVPTXInstrInfo; - let AssemblyWriters = [NVPTXAsmWriter]; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h index 19d73c5783cb..5b610687e391 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h @@ -30,16 +30,17 @@ public: static char ID; // Pass ID NVPTXAllocaHoisting() : FunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DataLayout>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DataLayoutPass>(); + AU.addPreserved("stack-protector"); AU.addPreserved<MachineFunctionAnalysis>(); } - virtual const char *getPassName() const { + const char *getPassName() const override { return "NVPTX specific alloca hoisting"; } - virtual bool runOnFunction(Function &function); + bool runOnFunction(Function &function) override; }; extern FunctionPass *createAllocaHoisting(); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 7552fe704115..187b88c1d54a 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -13,26 +13,27 @@ //===----------------------------------------------------------------------===// #include "NVPTXAsmPrinter.h" +#include "InstPrinter/NVPTXInstPrinter.h" #include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" #include "NVPTXInstrInfo.h" +#include "NVPTXMachineFunctionInfo.h" #include "NVPTXMCExpr.h" #include "NVPTXRegisterInfo.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" -#include "InstPrinter/NVPTXInstPrinter.h" #include "cl_common_defines.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Assembly/Writer.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/DebugInfo.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCStreamer.h" @@ -43,7 +44,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TimeValue.h" -#include "llvm/Target/Mangler.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include <sstream> using namespace llvm; @@ -132,7 +132,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx); const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV); - if (CE == 0) + if (!CE) llvm_unreachable("Unknown constant value to lower!"); switch (CE->getOpcode()) { @@ -149,10 +149,25 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { std::string S; raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; - WriteAsOperand(OS, CE, /*PrintType=*/ false, - !AP.MF ? 
0 : AP.MF->getFunction()->getParent()); + CE->printAsOperand(OS, /*PrintType=*/ false, + !AP.MF ? nullptr : AP.MF->getFunction()->getParent()); report_fatal_error(OS.str()); } + case Instruction::AddrSpaceCast: { + // Strip any addrspace(1)->addrspace(0) addrspace casts. These will be + // handled by the generic() logic in the MCExpr printer + PointerType *DstTy = cast<PointerType>(CE->getType()); + PointerType *SrcTy = cast<PointerType>(CE->getOperand(0)->getType()); + if (SrcTy->getAddressSpace() == 1 && DstTy->getAddressSpace() == 0) { + return LowerConstant(cast<const Constant>(CE->getOperand(0)), AP); + } + std::string S; + raw_string_ostream OS(S); + OS << "Unsupported expression in static initializer: "; + CE->printAsOperand(OS, /*PrintType=*/ false, + !AP.MF ? nullptr : AP.MF->getFunction()->getParent()); + report_fatal_error(OS.str()); + } case Instruction::GetElementPtr: { const DataLayout &TD = *AP.TM.getDataLayout(); // Generate a symbolic expression for the byte address @@ -308,16 +323,80 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst Inst; lowerToMCInst(MI, Inst); - OutStreamer.EmitInstruction(Inst); + EmitToStreamer(OutStreamer, Inst); +} + +// Handle symbol backtracking for targets that do not support image handles +bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI, + unsigned OpNo, MCOperand &MCOp) { + const MachineOperand &MO = MI->getOperand(OpNo); + const MCInstrDesc &MCID = MI->getDesc(); + + if (MCID.TSFlags & NVPTXII::IsTexFlag) { + // This is a texture fetch, so operand 4 is a texref and operand 5 is + // a samplerref + if (OpNo == 4 && MO.isImm()) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + if (OpNo == 5 && MO.isImm() && !(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } else if (MCID.TSFlags & NVPTXII::IsSuldMask) { + unsigned VecSize = + 1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1); + + // For a surface load of vector size N, the Nth operand will be the surfref + if (OpNo == VecSize && MO.isImm()) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } else if (MCID.TSFlags & NVPTXII::IsSustFlag) { + // This is a surface store, so operand 0 is a surfref + if (OpNo == 0 && MO.isImm()) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) { + // This is a query, so operand 1 is a surfref/texref + if (OpNo == 1 && MO.isImm()) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + + return false; +} + +void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) { + // Ewwww + TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget()); + NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM); + const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>(); + const char *Sym = MFI->getImageHandleSymbol(Index); + std::string *SymNamePtr = + nvTM.getManagedStrPool()->getManagedString(Sym); + MCOp = GetSymbolRef(OutContext.GetOrCreateSymbol( + StringRef(SymNamePtr->c_str()))); } void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { OutMI.setOpcode(MI->getOpcode()); + const NVPTXSubtarget &ST = TM.getSubtarget<NVPTXSubtarget>(); // Special: Do not mangle symbol operand of CALL_PROTOTYPE if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) { const MachineOperand &MO = 
MI->getOperand(0); - OutMI.addOperand(GetSymbolRef(MO, + OutMI.addOperand(GetSymbolRef( OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName())))); return; } @@ -326,6 +405,13 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCOp; + if (!ST.hasImageHandles()) { + if (lowerImageHandleOperand(MI, i, MCOp)) { + OutMI.addOperand(MCOp); + continue; + } + } + if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } @@ -346,10 +432,10 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, MO.getMBB()->getSymbol(), OutContext)); break; case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName())); + MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName())); break; case MachineOperand::MO_GlobalAddress: - MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal())); + MCOp = GetSymbolRef(getSymbol(MO.getGlobal())); break; case MachineOperand::MO_FPImmediate: { const ConstantFP *Cnt = MO.getFPImm(); @@ -408,8 +494,7 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { } } -MCOperand NVPTXAsmPrinter::GetSymbolRef(const MachineOperand &MO, - const MCSymbol *Symbol) { +MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { const MCExpr *Expr; Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext); @@ -430,7 +515,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { O << " ("; if (isABI) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy()) { + if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) { unsigned size = 0; if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { size = ITy->getBitWidth(); @@ -447,23 +532,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { << " func_retval0"; } else { if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) { - SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, Ty, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = vtparts[i].getVectorElementType(); - } - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } + unsigned totalsz = TD->getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!llvm::getAlign(*F, 0, retAlignment)) retAlignment = TD->getABITypeAlignment(Ty); @@ -700,12 +769,11 @@ static bool usedInGlobalVarDef(const Constant *C) { return true; } - for (Value::const_use_iterator ui = C->use_begin(), ue = C->use_end(); - ui != ue; ++ui) { - const Constant *C = dyn_cast<Constant>(*ui); - if (usedInGlobalVarDef(C)) - return true; - } + for (const User *U : C->users()) + if (const Constant *C = dyn_cast<Constant>(U)) + if (usedInGlobalVarDef(C)) + return true; + return false; } @@ -731,11 +799,10 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) { (md->getName().str() == "llvm.dbg.sp"))) return true; - for (User::const_use_iterator ui = U->use_begin(), ue = U->use_end(); - ui != ue; ++ui) { - if (usedInOneFunc(*ui, oneFunc) == false) + for (const User *UU : U->users()) + if (usedInOneFunc(UU, oneFunc) == false) return false; - } + return true; } @@ -753,7 +820,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) 
return false; - const Function *oneFunc = 0; + const Function *oneFunc = nullptr; bool flag = usedInOneFunc(gv, oneFunc); if (flag == false) @@ -766,12 +833,11 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { static bool useFuncSeen(const Constant *C, llvm::DenseMap<const Function *, bool> &seenMap) { - for (Value::const_use_iterator ui = C->use_begin(), ue = C->use_end(); - ui != ue; ++ui) { - if (const Constant *cu = dyn_cast<Constant>(*ui)) { + for (const User *U : C->users()) { + if (const Constant *cu = dyn_cast<Constant>(U)) { if (useFuncSeen(cu, seenMap)) return true; - } else if (const Instruction *I = dyn_cast<Instruction>(*ui)) { + } else if (const Instruction *I = dyn_cast<Instruction>(U)) { const BasicBlock *bb = I->getParent(); if (!bb) continue; @@ -798,10 +864,8 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { emitDeclaration(F, O); continue; } - for (Value::const_use_iterator iter = F->use_begin(), - iterEnd = F->use_end(); - iter != iterEnd; ++iter) { - if (const Constant *C = dyn_cast<Constant>(*iter)) { + for (const User *U : F->users()) { + if (const Constant *C = dyn_cast<Constant>(U)) { if (usedInGlobalVarDef(C)) { // The use is in the initialization of a global variable // that is a function pointer, so print a declaration @@ -817,9 +881,9 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { } } - if (!isa<Instruction>(*iter)) + if (!isa<Instruction>(U)) continue; - const Instruction *instr = cast<Instruction>(*iter); + const Instruction *instr = cast<Instruction>(U); const BasicBlock *bb = instr->getParent(); if (!bb) continue; @@ -844,10 +908,7 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) { DbgFinder.processModule(M); unsigned i = 1; - for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(), - E = DbgFinder.compile_unit_end(); - I != E; ++I) { - DICompileUnit DIUnit(*I); + for (DICompileUnit DIUnit : DbgFinder.compile_units()) { StringRef Filename(DIUnit.getFilename()); StringRef Dirname(DIUnit.getDirectory()); SmallString<128> FullPathName = Dirname; @@ -862,10 +923,7 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) { ++i; } - for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(), - E = DbgFinder.subprogram_end(); - I != E; ++I) { - DISubprogram SP(*I); + for (DISubprogram SP : DbgFinder.subprograms()) { StringRef Filename(SP.getFilename()); StringRef Dirname(SP.getDirectory()); SmallString<128> FullPathName = Dirname; @@ -895,7 +953,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { const_cast<TargetLoweringObjectFile &>(getObjFileLowering()) .Initialize(OutContext, TM); - Mang = new Mangler(&TM); + Mang = new Mangler(TM.getDataLayout()); // Emit header before any dwarf directives are emitted below. emitHeader(M, OS1); @@ -1022,6 +1080,8 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { for (i = 0; i < n; i++) global_list.insert(global_list.end(), gv_array[i]); + clearAnnotationCache(&M); + delete[] gv_array; return ret; @@ -1043,6 +1103,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { // external global variable with init -> .visible // external without init -> .extern // appending -> not allowed, assert. +// for any linkage other than +// internal, private, linker_private, +// linker_private_weak, linker_private_weak_def_auto, +// we emit -> .weak. 
void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, raw_ostream &O) { @@ -1068,6 +1132,9 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, msg.append(V->getName().str()); msg.append("has unsupported appending linkage type"); llvm_unreachable(msg.c_str()); + } else if (!V->hasInternalLinkage() && + !V->hasPrivateLinkage()) { + O << ".weak "; } } } @@ -1078,10 +1145,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Skip meta data if (GVar->hasSection()) { - if (GVar->getSection() == "llvm.metadata") + if (GVar->getSection() == StringRef("llvm.metadata")) return; } + // Skip LLVM intrinsic global variables + if (GVar->getName().startswith("llvm.") || + GVar->getName().startswith("nvvm.")) + return; + const DataLayout *TD = TM.getDataLayout(); // GlobalVariables are always constant pointers themselves. @@ -1093,6 +1165,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".visible "; else O << ".extern "; + } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() || + GVar->hasAvailableExternallyLinkage() || + GVar->hasCommonLinkage()) { + O << ".weak "; } if (llvm::isTexture(*GVar)) { @@ -1117,10 +1193,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, if (llvm::isSampler(*GVar)) { O << ".global .samplerref " << llvm::getSamplerName(*GVar); - const Constant *Initializer = NULL; + const Constant *Initializer = nullptr; if (GVar->hasInitializer()) Initializer = GVar->getInitializer(); - const ConstantInt *CI = NULL; + const ConstantInt *CI = nullptr; if (Initializer) CI = dyn_cast<ConstantInt>(Initializer); if (CI) { @@ -1160,7 +1236,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "linear"; break; case 2: - assert(0 && "Anisotropic filtering is not supported"); + llvm_unreachable("Anisotropic filtering is not supported"); default: O << "nearest"; break; @@ -1187,7 +1263,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, return; } - const Function *demotedFunc = 0; + const Function *demotedFunc = nullptr; if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) { O << "// " << GVar->getName().str() << " has been demoted\n"; if (localDecls.find(demotedFunc) != localDecls.end()) @@ -1202,12 +1278,17 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "."; emitPTXAddressSpace(PTy->getAddressSpace(), O); + + if (isManaged(*GVar)) { + O << " .attribute(.managed)"; + } + if (GVar->getAlignment() == 0) O << " .align " << (int) TD->getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); - if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) { + if (ETy->isSingleValueType()) { O << " ."; // Special case: ABI requires that we use .u8 for predicates if (ETy->isIntegerTy(1)) @@ -1219,13 +1300,24 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initilization only for constant and global state // spaces. 
- if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && - GVar->hasInitializer()) { - const Constant *Initializer = GVar->getInitializer(); - if (!Initializer->isNullValue()) { - O << " = "; - printScalarConstant(Initializer, O); + if (GVar->hasInitializer()) { + if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) { + const Constant *Initializer = GVar->getInitializer(); + // 'undef' is treated as there is no value spefied. + if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) { + O << " = "; + printScalarConstant(Initializer, O); + } + } else { + // The frontend adds zero-initializer to variables that don't have an + // initial value, so skip warning for this case. + if (!GVar->getInitializer()->isNullValue()) { + std::string warnMsg = "initial value of '" + GVar->getName().str() + + "' is not allowed in addrspace(" + + llvm::utostr_32(PTy->getAddressSpace()) + ")"; + report_fatal_error(warnMsg.c_str()); + } } } } else { @@ -1284,7 +1376,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } } @@ -1359,7 +1451,7 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { return "u32"; } llvm_unreachable("unexpected type"); - return NULL; + return nullptr; } void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, @@ -1378,7 +1470,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, else O << " .align " << GVar->getAlignment(); - if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) { + if (ETy->isSingleValueType()) { O << " ."; O << getPTXFundamentalTypeStr(ETy); O << " "; @@ -1404,13 +1496,13 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "]"; break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } return; } static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty)) + if (Ty->isSingleValueType()) return TD->getPrefTypeAlignment(Ty); const ArrayType *ATy = dyn_cast<ArrayType>(Ty); @@ -1507,24 +1599,38 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { first = false; // Handle image/sampler parameters - if (llvm::isSampler(*I) || llvm::isImage(*I)) { - if (llvm::isImage(*I)) { - std::string sname = I->getName(); - if (llvm::isImageWriteOnly(*I)) - O << "\t.param .surfref " << *getSymbol(F) << "_param_" - << paramIndex; - else // Default image is read_only - O << "\t.param .texref " << *getSymbol(F) << "_param_" - << paramIndex; - } else // Should be llvm::isSampler(*I) - O << "\t.param .samplerref " << *getSymbol(F) << "_param_" - << paramIndex; - continue; + if (isKernelFunction(*F)) { + if (isSampler(*I) || isImage(*I)) { + if (isImage(*I)) { + std::string sname = I->getName(); + if (isImageWriteOnly(*I) || isImageReadWrite(*I)) { + if (nvptxSubtarget.hasImageHandles()) + O << "\t.param .u64 .ptr .surfref "; + else + O << "\t.param .surfref "; + O << *CurrentFnSym << "_param_" << paramIndex; + } + else { // Default image is read_only + if (nvptxSubtarget.hasImageHandles()) + O << "\t.param .u64 .ptr .texref "; + else + O << "\t.param .texref "; + O << *CurrentFnSym << "_param_" << paramIndex; + } + } else { + if 
(nvptxSubtarget.hasImageHandles()) + O << "\t.param .u64 .ptr .samplerref "; + else + O << "\t.param .samplerref "; + O << *CurrentFnSym << "_param_" << paramIndex; + } + continue; + } } if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) { - if (Ty->isVectorTy()) { - // Just print .param .b8 .align <a> .param[size]; + if (Ty->isAggregateType() || Ty->isVectorTy()) { + // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); @@ -1580,7 +1686,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { continue; } // Non-kernel function, just print .param .b<size> for ABI - // and .reg .b<size> for non ABY + // and .reg .b<size> for non-ABI unsigned sz = 0; if (isa<IntegerType>(Ty)) { sz = cast<IntegerType>(Ty)->getBitWidth(); @@ -1604,7 +1710,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { Type *ETy = PTy->getElementType(); if (isABI || isKernelFunc) { - // Just print .param .b8 .align <a> .param[size]; + // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); @@ -1702,9 +1808,9 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; - // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n"; // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; - // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n"; // Emit declaration of the virtual registers or 'physical' registers for // each register class @@ -1764,13 +1870,35 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) { return; } if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { - O << *getSymbol(GVar); + PointerType *PTy = dyn_cast<PointerType>(GVar->getType()); + bool IsNonGenericPointer = false; + if (PTy && PTy->getAddressSpace() != 0) { + IsNonGenericPointer = true; + } + if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) { + O << "generic("; + O << *getSymbol(GVar); + O << ")"; + } else { + O << *getSymbol(GVar); + } return; } if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { const Value *v = Cexpr->stripPointerCasts(); + PointerType *PTy = dyn_cast<PointerType>(Cexpr->getType()); + bool IsNonGenericPointer = false; + if (PTy && PTy->getAddressSpace() != 0) { + IsNonGenericPointer = true; + } if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { - O << *getSymbol(GVar); + if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) { + O << "generic("; + O << *getSymbol(GVar); + O << ")"; + } else { + O << *getSymbol(GVar); + } return; } else { O << *LowerConstant(CPV, *this); @@ -2087,21 +2215,6 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, O << *getSymbol(MO.getGlobal()); break; - case MachineOperand::MO_ExternalSymbol: { - const char *symbname = MO.getSymbolName(); - if (strstr(symbname, ".PARAM") == symbname) { - unsigned index; - sscanf(symbname + 6, "%u[];", &index); - printParamName(index, O); - } else if (strstr(symbname, ".HLPPARAM") == symbname) { - unsigned index; - sscanf(symbname + 9, "%u[];", &index); - O << 
*CurrentFnSym << "_param_" << index << "_offset"; - } else - O << symbname; - break; - } - case MachineOperand::MO_MachineBasicBlock: O << *MO.getMBB()->getSymbol(); return; @@ -2148,7 +2261,7 @@ void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) { } LineReader *NVPTXAsmPrinter::getReader(std::string filename) { - if (reader == NULL) { + if (!reader) { reader = new LineReader(filename); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 3abe5d166826..a9f9bdd6d3d8 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -27,7 +27,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/Mangler.h" #include "llvm/Target/TargetMachine.h" #include <fstream> @@ -97,6 +96,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { unsigned curpos; raw_ostream &O; NVPTXAsmPrinter &AP; + bool EmitGeneric; public: AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP) @@ -105,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { size = _size; curpos = 0; numSymbols = 0; + EmitGeneric = AP.EmitGeneric; } ~AggBuffer() { delete[] buffer; } unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { @@ -156,7 +157,18 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { const Value *v = Symbols[nSym]; if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { MCSymbol *Name = AP.getSymbol(GVar); - O << *Name; + PointerType *PTy = dyn_cast<PointerType>(GVar->getType()); + bool IsNonGenericPointer = false; + if (PTy && PTy->getAddressSpace() != 0) { + IsNonGenericPointer = true; + } + if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) { + O << "generic("; + O << *Name; + O << ")"; + } else { + O << *Name; + } } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { O << *nvptx::LowerConstant(Cexpr, AP); } else @@ -177,35 +189,32 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { friend class AggBuffer; - virtual void emitSrcInText(StringRef filename, unsigned line); + void emitSrcInText(StringRef filename, unsigned line); private: - virtual const char *getPassName() const { return "NVPTX Assembly Printer"; } + const char *getPassName() const override { return "NVPTX Assembly Printer"; } const Function *F; std::string CurrentFnName; - void EmitFunctionEntryLabel(); - void EmitFunctionBodyStart(); - void EmitFunctionBodyEnd(); - void emitImplicitDef(const MachineInstr *MI) const; + void EmitFunctionEntryLabel() override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void emitImplicitDef(const MachineInstr *MI) const override; - void EmitInstruction(const MachineInstr *); + void EmitInstruction(const MachineInstr *) override; void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI); bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); - MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); + MCOperand GetSymbolRef(const MCSymbol *Symbol); unsigned encodeVirtualRegister(unsigned Reg); - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {} + void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {} - void printGlobalVariable(const GlobalVariable *GVar); void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier, raw_ostream &O); void printMemOperand(const 
MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; - // definition autogenerated. - void printInstruction(const MachineInstr *MI, raw_ostream &O); void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool = false); void printParamName(int paramIndex, raw_ostream &O); @@ -225,15 +234,15 @@ private: void printReturnValStr(const MachineFunction &MF, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &); + raw_ostream &) override; void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &); + raw_ostream &) override; protected: - bool doInitialization(Module &M); - bool doFinalization(Module &M); + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; private: std::string CurrentBankselLabelInBasicBlock; @@ -278,14 +287,33 @@ private: static const char *getRegisterName(unsigned RegNo); void emitDemotedVars(const Function *, raw_ostream &); + bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, + MCOperand &MCOp); + void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp); + LineReader *reader; LineReader *getReader(std::string); + + // Used to control the need to emit .generic() in the initializer of + // module scope variables. + // Although ptx supports the hybrid mode like the following, + // .global .u32 a; + // .global .u32 b; + // .global .u32 addr[] = {a, generic(b)} + // we have difficulty representing the difference in the NVVM IR. + // + // Since the address value should always be generic in CUDA C and always + // be specific in OpenCL, we use this simple control here. + // + bool EmitGeneric; + public: NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer), nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) { CurrentBankselLabelInBasicBlock = ""; - reader = NULL; + reader = nullptr; + EmitGeneric = (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA); } ~NVPTXAsmPrinter() { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp new file mode 100644 index 000000000000..962b12312026 --- /dev/null +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -0,0 +1,84 @@ +//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Clean up the names of global variables in the module to not contain symbols +// that are invalid in PTX. +// +// Currently NVPTX, like other backends, relies on generic symbol name +// sanitizing done by MC. However, the ptxas assembler is more stringent and +// disallows some additional characters in symbol names. This pass makes sure +// such names do not reach MC at all. 
+// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Support/raw_ostream.h" +#include <string> + +using namespace llvm; + +namespace { +/// \brief NVPTXAssignValidGlobalNames +class NVPTXAssignValidGlobalNames : public ModulePass { +public: + static char ID; + NVPTXAssignValidGlobalNames() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + + /// \brief Clean up the name to remove symbols invalid in PTX. + std::string cleanUpName(StringRef Name); +}; +} + +char NVPTXAssignValidGlobalNames::ID = 0; + +namespace llvm { +void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); +} + +INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names", + "Assign valid PTX names to globals", false, false) + +bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) { + for (GlobalVariable &GV : M.globals()) { + // We are only allowed to rename local symbols. + if (GV.hasLocalLinkage()) { + // setName doesn't do extra work if the name does not change. + // Note: this does not create collisions - if setName is asked to set the + // name to something that already exists, it adds a proper postfix to + // avoid collisions. + GV.setName(cleanUpName(GV.getName())); + } + } + + return true; +} + +std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) { + std::string ValidName; + raw_string_ostream ValidNameStream(ValidName); + for (unsigned I = 0, E = Name.size(); I != E; ++I) { + char C = Name[I]; + if (C == '.' || C == '@') { + ValidNameStream << "_$_"; + } else { + ValidNameStream << C; + } + } + + return ValidNameStream.str(); +} + +ModulePass *llvm::createNVPTXAssignValidGlobalNamesPass() { + return new NVPTXAssignValidGlobalNames(); +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp new file mode 100644 index 000000000000..f3a095d829e4 --- /dev/null +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -0,0 +1,195 @@ +//===-- NVPTXFavorNonGenericAddrSpace.cpp - ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// When a load/store accesses the generic address space, checks whether the +// address is casted from a non-generic address space. If so, remove this +// addrspacecast because accessing non-generic address spaces is typically +// faster. Besides seeking addrspacecasts, this optimization also traces into +// the base pointer of a GEP. +// +// For instance, the code below loads a float from an array allocated in +// addrspace(3). +// +// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]* +// %1 = gep [10 x float]* %0, i64 0, i64 %i +// %2 = load float* %1 ; emits ld.f32 +// +// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast +// and the GEP to expose more optimization opportunities to function +// optimizeMemoryInst. 
The intermediate code looks like: +// +// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i +// %1 = addrspacecast float addrspace(3)* %0 to float* +// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly +// +// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed +// generic pointers, and folds the load and the addrspacecast into a load from +// the original address space. The final code looks like: +// +// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i +// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32 +// +// This pass may remove an addrspacecast in a different BB. Therefore, we +// implement it as a FunctionPass. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +// An option to disable this optimization. Enable it by default. +static cl::opt<bool> DisableFavorNonGeneric( + "disable-nvptx-favor-non-generic", + cl::init(false), + cl::desc("Do not convert generic address space usage " + "to non-generic address space usage"), + cl::Hidden); + +namespace { +/// \brief NVPTXFavorNonGenericAddrSpaces +class NVPTXFavorNonGenericAddrSpaces : public FunctionPass { +public: + static char ID; + NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + /// Optimizes load/store instructions. Idx is the index of the pointer operand + /// (0 for load, and 1 for store). Returns true if it changes anything. + bool optimizeMemoryInstruction(Instruction *I, unsigned Idx); + /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X, + /// indices)". This reordering exposes to optimizeMemoryInstruction more + /// optimization opportunities on loads and stores. Returns true if it changes + /// the program. + bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP); +}; +} + +char NVPTXFavorNonGenericAddrSpaces::ID = 0; + +namespace llvm { +void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); +} +INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic", + "Remove unnecessary non-generic-to-generic addrspacecasts", + false, false) + +// Decides whether removing Cast is valid and beneficial. Cast can be an +// instruction or a constant expression. +static bool IsEliminableAddrSpaceCast(Operator *Cast) { + // Returns false if not even an addrspacecast. + if (Cast->getOpcode() != Instruction::AddrSpaceCast) + return false; + + Value *Src = Cast->getOperand(0); + PointerType *SrcTy = cast<PointerType>(Src->getType()); + PointerType *DestTy = cast<PointerType>(Cast->getType()); + // TODO: For now, we only handle the case where the addrspacecast only changes + // the address space but not the type. If the type also changes, we could + // still get rid of the addrspacecast by adding an extra bitcast, but we + // rarely see such scenarios. + if (SrcTy->getElementType() != DestTy->getElementType()) + return false; + + // Checks whether the addrspacecast is from a non-generic address space to the + // generic address space. 
+ return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC && + DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC); +} + +bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP( + GEPOperator *GEP) { + Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand()); + if (!Cast) + return false; + + if (!IsEliminableAddrSpaceCast(Cast)) + return false; + + SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end()); + if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) { + // %1 = gep (addrspacecast X), indices + // => + // %0 = gep X, indices + // %1 = addrspacecast %0 + GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0), + Indices, + GEP->getName(), + GEPI); + NewGEPI->setIsInBounds(GEP->isInBounds()); + GEP->replaceAllUsesWith( + new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI)); + } else { + // GEP is a constant expression. + Constant *NewGEPCE = ConstantExpr::getGetElementPtr( + cast<Constant>(Cast->getOperand(0)), + Indices, + GEP->isInBounds()); + GEP->replaceAllUsesWith( + ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType())); + } + + return true; +} + +bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI, + unsigned Idx) { + // If the pointer operand is a GEP, hoist the addrspacecast if any from the + // GEP to expose more optimization opportunites. + if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) { + hoistAddrSpaceCastFromGEP(GEP); + } + + // load/store (addrspacecast X) => load/store X if shortcutting the + // addrspacecast is valid and can improve performance. + // + // e.g., + // %1 = addrspacecast float addrspace(3)* %0 to float* + // %2 = load float* %1 + // -> + // %2 = load float addrspace(3)* %0 + // + // Note: the addrspacecast can also be a constant expression. 
+ if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) { + if (IsEliminableAddrSpaceCast(Cast)) { + MI->setOperand(Idx, Cast->getOperand(0)); + return true; + } + } + + return false; +} + +bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { + if (DisableFavorNonGeneric) + return false; + + bool Changed = false; + for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) { + if (isa<LoadInst>(I)) { + // V = load P + Changed |= optimizeMemoryInstruction(I, 0); + } else if (isa<StoreInst>(I)) { + // store V, P + Changed |= optimizeMemoryInstruction(I, 1); + } + } + } + return Changed; +} + +FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() { + return new NVPTXFavorNonGenericAddrSpaces(); +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9030584f06fc..8b088412dbba 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -26,6 +26,10 @@ using namespace llvm; +NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), + is64bit(STI.is64Bit()) {} + bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -43,17 +47,21 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { // cvta.local %SP, %SPL; if (is64bit) { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), LocalReg).addImm(MF.getFunctionNumber()); } else { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), LocalReg).addImm(MF.getFunctionNumber()); } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h index 819f1dd3f4be..56fb673de0eb 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -17,24 +17,20 @@ #include "llvm/Target/TargetFrameLowering.h" namespace llvm { -class NVPTXTargetMachine; - +class NVPTXSubtarget; class NVPTXFrameLowering : public TargetFrameLowering { - NVPTXTargetMachine &tm; bool is64bit; public: - explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit) - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm), - is64bit(_is64bit) {} + explicit NVPTXFrameLowering(NVPTXSubtarget &STI); - virtual bool hasFP(const MachineFunction &MF) const; - virtual void emitPrologue(MachineFunction &MF) const; - virtual void 
emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + bool hasFP(const MachineFunction &MF) const override; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 9fb0dd88b758..faa9fdb424b6 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -13,20 +13,19 @@ //===----------------------------------------------------------------------===// #include "NVPTX.h" -#include "NVPTXUtilities.h" #include "MCTargetDesc/NVPTXBaseInfo.h" - -#include "llvm/PassManager.h" +#include "NVPTXUtilities.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" -#include "llvm/ADT/ValueMap.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/PassManager.h" using namespace llvm; @@ -41,10 +40,9 @@ public: GenericToNVVM() : ModulePass(ID) {} - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - } + void getAnalysisUsage(AnalysisUsage &AU) const override {} private: Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV, @@ -64,7 +62,7 @@ private: GVMapTy GVMap; ConstantToValueMapTy ConstantToValueMap; }; -} +} // end namespace char GenericToNVVM::ID = 0; @@ -86,10 +84,11 @@ bool GenericToNVVM::runOnModule(Module &M) { GlobalVariable *GV = I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && - !GV->getName().startswith("llvm.")) { + !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { GlobalVariable *NewGV = new GlobalVariable( M, GV->getType()->getElementType(), GV->isConstant(), - GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL, + GV->getLinkage(), + GV->hasInitializer() ? GV->getInitializer() : nullptr, "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL); NewGV->copyAttributesFrom(GV); GVMap[GV] = NewGV; @@ -147,10 +146,8 @@ bool GenericToNVVM::runOnModule(Module &M) { // variable initializers, as other uses have been already been removed // while walking through the instructions in function definitions. for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end(); - UI != UE;) { - Use &U = (UI++).getUse(); - U.set(BitCastNewGV); - } + UI != UE;) + (UI++)->set(BitCastNewGV); std::string Name = GV->getName(); GV->removeDeadConstantUsers(); GV->eraseFromParent(); @@ -165,7 +162,7 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV, IRBuilder<> &Builder) { PointerType *GVType = GV->getType(); - Value *CVTA = NULL; + Value *CVTA = nullptr; // See if the address space conversion requires the operand to be bitcast // to i8 addrspace(n)* first. 
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 4b8b306a705a..05205fba1aff 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -20,16 +20,9 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" -#undef DEBUG_TYPE -#define DEBUG_TYPE "nvptx-isel" - using namespace llvm; -static cl::opt<int> -FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, - cl::desc("NVPTX Specific: FMA contraction (0: don't do it" - " 1: do it 2: do it aggressively"), - cl::init(2)); +#define DEBUG_TYPE "nvptx-isel" static cl::opt<int> UsePrecDivF32( "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, @@ -59,16 +52,6 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), Subtarget(tm.getSubtarget<NVPTXSubtarget>()) { - - doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1); - doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1); - doFMAF32AGG = - (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2); - doFMAF64AGG = - (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2); - - allowFMA = (FMAContractLevel >= 1); - doMulWide = (OptLevel > 0); } @@ -114,16 +97,21 @@ bool NVPTXDAGToDAGISel::useF32FTZ() const { } } +bool NVPTXDAGToDAGISel::allowFMA() const { + const NVPTXTargetLowering *TL = Subtarget.getTargetLowering(); + return TL->allowFMA(*MF, OptLevel); +} + /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. 
} - SDNode *ResNode = NULL; + SDNode *ResNode = nullptr; switch (N->getOpcode()) { case ISD::LOAD: ResNode = SelectLoad(N); @@ -139,7 +127,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::LDGV4: case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: - ResNode = SelectLDGLDUVector(N); + ResNode = SelectLDGLDU(N); break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -162,6 +150,358 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::StoreParamU32: ResNode = SelectStoreParam(N); break; + case ISD::INTRINSIC_WO_CHAIN: + ResNode = SelectIntrinsicNoChain(N); + break; + case ISD::INTRINSIC_W_CHAIN: + ResNode = SelectIntrinsicChain(N); + break; + case NVPTXISD::Tex1DFloatS32: + case NVPTXISD::Tex1DFloatFloat: + case NVPTXISD::Tex1DFloatFloatLevel: + case NVPTXISD::Tex1DFloatFloatGrad: + case NVPTXISD::Tex1DS32S32: + case NVPTXISD::Tex1DS32Float: + case NVPTXISD::Tex1DS32FloatLevel: + case NVPTXISD::Tex1DS32FloatGrad: + case NVPTXISD::Tex1DU32S32: + case NVPTXISD::Tex1DU32Float: + case NVPTXISD::Tex1DU32FloatLevel: + case NVPTXISD::Tex1DU32FloatGrad: + case NVPTXISD::Tex1DArrayFloatS32: + case NVPTXISD::Tex1DArrayFloatFloat: + case NVPTXISD::Tex1DArrayFloatFloatLevel: + case NVPTXISD::Tex1DArrayFloatFloatGrad: + case NVPTXISD::Tex1DArrayS32S32: + case NVPTXISD::Tex1DArrayS32Float: + case NVPTXISD::Tex1DArrayS32FloatLevel: + case NVPTXISD::Tex1DArrayS32FloatGrad: + case NVPTXISD::Tex1DArrayU32S32: + case NVPTXISD::Tex1DArrayU32Float: + case NVPTXISD::Tex1DArrayU32FloatLevel: + case NVPTXISD::Tex1DArrayU32FloatGrad: + case NVPTXISD::Tex2DFloatS32: + case NVPTXISD::Tex2DFloatFloat: + case NVPTXISD::Tex2DFloatFloatLevel: + case NVPTXISD::Tex2DFloatFloatGrad: + case NVPTXISD::Tex2DS32S32: + case NVPTXISD::Tex2DS32Float: + case NVPTXISD::Tex2DS32FloatLevel: + case NVPTXISD::Tex2DS32FloatGrad: + case NVPTXISD::Tex2DU32S32: + case NVPTXISD::Tex2DU32Float: + case NVPTXISD::Tex2DU32FloatLevel: + case NVPTXISD::Tex2DU32FloatGrad: + case NVPTXISD::Tex2DArrayFloatS32: + case NVPTXISD::Tex2DArrayFloatFloat: + case NVPTXISD::Tex2DArrayFloatFloatLevel: + case NVPTXISD::Tex2DArrayFloatFloatGrad: + case NVPTXISD::Tex2DArrayS32S32: + case NVPTXISD::Tex2DArrayS32Float: + case NVPTXISD::Tex2DArrayS32FloatLevel: + case NVPTXISD::Tex2DArrayS32FloatGrad: + case NVPTXISD::Tex2DArrayU32S32: + case NVPTXISD::Tex2DArrayU32Float: + case NVPTXISD::Tex2DArrayU32FloatLevel: + case NVPTXISD::Tex2DArrayU32FloatGrad: + case NVPTXISD::Tex3DFloatS32: + case NVPTXISD::Tex3DFloatFloat: + case NVPTXISD::Tex3DFloatFloatLevel: + case NVPTXISD::Tex3DFloatFloatGrad: + case NVPTXISD::Tex3DS32S32: + case NVPTXISD::Tex3DS32Float: + case NVPTXISD::Tex3DS32FloatLevel: + case NVPTXISD::Tex3DS32FloatGrad: + case NVPTXISD::Tex3DU32S32: + case NVPTXISD::Tex3DU32Float: + case NVPTXISD::Tex3DU32FloatLevel: + case NVPTXISD::Tex3DU32FloatGrad: + case NVPTXISD::TexCubeFloatFloat: + case NVPTXISD::TexCubeFloatFloatLevel: + case NVPTXISD::TexCubeS32Float: + case NVPTXISD::TexCubeS32FloatLevel: + case NVPTXISD::TexCubeU32Float: + case NVPTXISD::TexCubeU32FloatLevel: + case NVPTXISD::TexCubeArrayFloatFloat: + case NVPTXISD::TexCubeArrayFloatFloatLevel: + case NVPTXISD::TexCubeArrayS32Float: + case NVPTXISD::TexCubeArrayS32FloatLevel: + case NVPTXISD::TexCubeArrayU32Float: + case NVPTXISD::TexCubeArrayU32FloatLevel: + case NVPTXISD::Tld4R2DFloatFloat: + case NVPTXISD::Tld4G2DFloatFloat: + case NVPTXISD::Tld4B2DFloatFloat: + case NVPTXISD::Tld4A2DFloatFloat: + case NVPTXISD::Tld4R2DS64Float: + case NVPTXISD::Tld4G2DS64Float: + case 
NVPTXISD::Tld4B2DS64Float: + case NVPTXISD::Tld4A2DS64Float: + case NVPTXISD::Tld4R2DU64Float: + case NVPTXISD::Tld4G2DU64Float: + case NVPTXISD::Tld4B2DU64Float: + case NVPTXISD::Tld4A2DU64Float: + case NVPTXISD::TexUnified1DFloatS32: + case NVPTXISD::TexUnified1DFloatFloat: + case NVPTXISD::TexUnified1DFloatFloatLevel: + case NVPTXISD::TexUnified1DFloatFloatGrad: + case NVPTXISD::TexUnified1DS32S32: + case NVPTXISD::TexUnified1DS32Float: + case NVPTXISD::TexUnified1DS32FloatLevel: + case NVPTXISD::TexUnified1DS32FloatGrad: + case NVPTXISD::TexUnified1DU32S32: + case NVPTXISD::TexUnified1DU32Float: + case NVPTXISD::TexUnified1DU32FloatLevel: + case NVPTXISD::TexUnified1DU32FloatGrad: + case NVPTXISD::TexUnified1DArrayFloatS32: + case NVPTXISD::TexUnified1DArrayFloatFloat: + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + case NVPTXISD::TexUnified1DArrayS32S32: + case NVPTXISD::TexUnified1DArrayS32Float: + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + case NVPTXISD::TexUnified1DArrayU32S32: + case NVPTXISD::TexUnified1DArrayU32Float: + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + case NVPTXISD::TexUnified2DFloatS32: + case NVPTXISD::TexUnified2DFloatFloat: + case NVPTXISD::TexUnified2DFloatFloatLevel: + case NVPTXISD::TexUnified2DFloatFloatGrad: + case NVPTXISD::TexUnified2DS32S32: + case NVPTXISD::TexUnified2DS32Float: + case NVPTXISD::TexUnified2DS32FloatLevel: + case NVPTXISD::TexUnified2DS32FloatGrad: + case NVPTXISD::TexUnified2DU32S32: + case NVPTXISD::TexUnified2DU32Float: + case NVPTXISD::TexUnified2DU32FloatLevel: + case NVPTXISD::TexUnified2DU32FloatGrad: + case NVPTXISD::TexUnified2DArrayFloatS32: + case NVPTXISD::TexUnified2DArrayFloatFloat: + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + case NVPTXISD::TexUnified2DArrayS32S32: + case NVPTXISD::TexUnified2DArrayS32Float: + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + case NVPTXISD::TexUnified2DArrayU32S32: + case NVPTXISD::TexUnified2DArrayU32Float: + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + case NVPTXISD::TexUnified3DFloatS32: + case NVPTXISD::TexUnified3DFloatFloat: + case NVPTXISD::TexUnified3DFloatFloatLevel: + case NVPTXISD::TexUnified3DFloatFloatGrad: + case NVPTXISD::TexUnified3DS32S32: + case NVPTXISD::TexUnified3DS32Float: + case NVPTXISD::TexUnified3DS32FloatLevel: + case NVPTXISD::TexUnified3DS32FloatGrad: + case NVPTXISD::TexUnified3DU32S32: + case NVPTXISD::TexUnified3DU32Float: + case NVPTXISD::TexUnified3DU32FloatLevel: + case NVPTXISD::TexUnified3DU32FloatGrad: + case NVPTXISD::TexUnifiedCubeFloatFloat: + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + case NVPTXISD::TexUnifiedCubeS32Float: + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + case NVPTXISD::TexUnifiedCubeU32Float: + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + case NVPTXISD::TexUnifiedCubeArrayS32Float: + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + case NVPTXISD::TexUnifiedCubeArrayU32Float: + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + case 
NVPTXISD::Tld4UnifiedR2DS64Float: + case NVPTXISD::Tld4UnifiedG2DS64Float: + case NVPTXISD::Tld4UnifiedB2DS64Float: + case NVPTXISD::Tld4UnifiedA2DS64Float: + case NVPTXISD::Tld4UnifiedR2DU64Float: + case NVPTXISD::Tld4UnifiedG2DU64Float: + case NVPTXISD::Tld4UnifiedB2DU64Float: + case NVPTXISD::Tld4UnifiedA2DU64Float: + ResNode = SelectTextureIntrinsic(N); + break; + case NVPTXISD::Suld1DI8Clamp: + case NVPTXISD::Suld1DI16Clamp: + case NVPTXISD::Suld1DI32Clamp: + case NVPTXISD::Suld1DI64Clamp: + case NVPTXISD::Suld1DV2I8Clamp: + case NVPTXISD::Suld1DV2I16Clamp: + case NVPTXISD::Suld1DV2I32Clamp: + case NVPTXISD::Suld1DV2I64Clamp: + case NVPTXISD::Suld1DV4I8Clamp: + case NVPTXISD::Suld1DV4I16Clamp: + case NVPTXISD::Suld1DV4I32Clamp: + case NVPTXISD::Suld1DArrayI8Clamp: + case NVPTXISD::Suld1DArrayI16Clamp: + case NVPTXISD::Suld1DArrayI32Clamp: + case NVPTXISD::Suld1DArrayI64Clamp: + case NVPTXISD::Suld1DArrayV2I8Clamp: + case NVPTXISD::Suld1DArrayV2I16Clamp: + case NVPTXISD::Suld1DArrayV2I32Clamp: + case NVPTXISD::Suld1DArrayV2I64Clamp: + case NVPTXISD::Suld1DArrayV4I8Clamp: + case NVPTXISD::Suld1DArrayV4I16Clamp: + case NVPTXISD::Suld1DArrayV4I32Clamp: + case NVPTXISD::Suld2DI8Clamp: + case NVPTXISD::Suld2DI16Clamp: + case NVPTXISD::Suld2DI32Clamp: + case NVPTXISD::Suld2DI64Clamp: + case NVPTXISD::Suld2DV2I8Clamp: + case NVPTXISD::Suld2DV2I16Clamp: + case NVPTXISD::Suld2DV2I32Clamp: + case NVPTXISD::Suld2DV2I64Clamp: + case NVPTXISD::Suld2DV4I8Clamp: + case NVPTXISD::Suld2DV4I16Clamp: + case NVPTXISD::Suld2DV4I32Clamp: + case NVPTXISD::Suld2DArrayI8Clamp: + case NVPTXISD::Suld2DArrayI16Clamp: + case NVPTXISD::Suld2DArrayI32Clamp: + case NVPTXISD::Suld2DArrayI64Clamp: + case NVPTXISD::Suld2DArrayV2I8Clamp: + case NVPTXISD::Suld2DArrayV2I16Clamp: + case NVPTXISD::Suld2DArrayV2I32Clamp: + case NVPTXISD::Suld2DArrayV2I64Clamp: + case NVPTXISD::Suld2DArrayV4I8Clamp: + case NVPTXISD::Suld2DArrayV4I16Clamp: + case NVPTXISD::Suld2DArrayV4I32Clamp: + case NVPTXISD::Suld3DI8Clamp: + case NVPTXISD::Suld3DI16Clamp: + case NVPTXISD::Suld3DI32Clamp: + case NVPTXISD::Suld3DI64Clamp: + case NVPTXISD::Suld3DV2I8Clamp: + case NVPTXISD::Suld3DV2I16Clamp: + case NVPTXISD::Suld3DV2I32Clamp: + case NVPTXISD::Suld3DV2I64Clamp: + case NVPTXISD::Suld3DV4I8Clamp: + case NVPTXISD::Suld3DV4I16Clamp: + case NVPTXISD::Suld3DV4I32Clamp: + case NVPTXISD::Suld1DI8Trap: + case NVPTXISD::Suld1DI16Trap: + case NVPTXISD::Suld1DI32Trap: + case NVPTXISD::Suld1DI64Trap: + case NVPTXISD::Suld1DV2I8Trap: + case NVPTXISD::Suld1DV2I16Trap: + case NVPTXISD::Suld1DV2I32Trap: + case NVPTXISD::Suld1DV2I64Trap: + case NVPTXISD::Suld1DV4I8Trap: + case NVPTXISD::Suld1DV4I16Trap: + case NVPTXISD::Suld1DV4I32Trap: + case NVPTXISD::Suld1DArrayI8Trap: + case NVPTXISD::Suld1DArrayI16Trap: + case NVPTXISD::Suld1DArrayI32Trap: + case NVPTXISD::Suld1DArrayI64Trap: + case NVPTXISD::Suld1DArrayV2I8Trap: + case NVPTXISD::Suld1DArrayV2I16Trap: + case NVPTXISD::Suld1DArrayV2I32Trap: + case NVPTXISD::Suld1DArrayV2I64Trap: + case NVPTXISD::Suld1DArrayV4I8Trap: + case NVPTXISD::Suld1DArrayV4I16Trap: + case NVPTXISD::Suld1DArrayV4I32Trap: + case NVPTXISD::Suld2DI8Trap: + case NVPTXISD::Suld2DI16Trap: + case NVPTXISD::Suld2DI32Trap: + case NVPTXISD::Suld2DI64Trap: + case NVPTXISD::Suld2DV2I8Trap: + case NVPTXISD::Suld2DV2I16Trap: + case NVPTXISD::Suld2DV2I32Trap: + case NVPTXISD::Suld2DV2I64Trap: + case NVPTXISD::Suld2DV4I8Trap: + case NVPTXISD::Suld2DV4I16Trap: + case NVPTXISD::Suld2DV4I32Trap: + case NVPTXISD::Suld2DArrayI8Trap: + case 
NVPTXISD::Suld2DArrayI16Trap: + case NVPTXISD::Suld2DArrayI32Trap: + case NVPTXISD::Suld2DArrayI64Trap: + case NVPTXISD::Suld2DArrayV2I8Trap: + case NVPTXISD::Suld2DArrayV2I16Trap: + case NVPTXISD::Suld2DArrayV2I32Trap: + case NVPTXISD::Suld2DArrayV2I64Trap: + case NVPTXISD::Suld2DArrayV4I8Trap: + case NVPTXISD::Suld2DArrayV4I16Trap: + case NVPTXISD::Suld2DArrayV4I32Trap: + case NVPTXISD::Suld3DI8Trap: + case NVPTXISD::Suld3DI16Trap: + case NVPTXISD::Suld3DI32Trap: + case NVPTXISD::Suld3DI64Trap: + case NVPTXISD::Suld3DV2I8Trap: + case NVPTXISD::Suld3DV2I16Trap: + case NVPTXISD::Suld3DV2I32Trap: + case NVPTXISD::Suld3DV2I64Trap: + case NVPTXISD::Suld3DV4I8Trap: + case NVPTXISD::Suld3DV4I16Trap: + case NVPTXISD::Suld3DV4I32Trap: + case NVPTXISD::Suld1DI8Zero: + case NVPTXISD::Suld1DI16Zero: + case NVPTXISD::Suld1DI32Zero: + case NVPTXISD::Suld1DI64Zero: + case NVPTXISD::Suld1DV2I8Zero: + case NVPTXISD::Suld1DV2I16Zero: + case NVPTXISD::Suld1DV2I32Zero: + case NVPTXISD::Suld1DV2I64Zero: + case NVPTXISD::Suld1DV4I8Zero: + case NVPTXISD::Suld1DV4I16Zero: + case NVPTXISD::Suld1DV4I32Zero: + case NVPTXISD::Suld1DArrayI8Zero: + case NVPTXISD::Suld1DArrayI16Zero: + case NVPTXISD::Suld1DArrayI32Zero: + case NVPTXISD::Suld1DArrayI64Zero: + case NVPTXISD::Suld1DArrayV2I8Zero: + case NVPTXISD::Suld1DArrayV2I16Zero: + case NVPTXISD::Suld1DArrayV2I32Zero: + case NVPTXISD::Suld1DArrayV2I64Zero: + case NVPTXISD::Suld1DArrayV4I8Zero: + case NVPTXISD::Suld1DArrayV4I16Zero: + case NVPTXISD::Suld1DArrayV4I32Zero: + case NVPTXISD::Suld2DI8Zero: + case NVPTXISD::Suld2DI16Zero: + case NVPTXISD::Suld2DI32Zero: + case NVPTXISD::Suld2DI64Zero: + case NVPTXISD::Suld2DV2I8Zero: + case NVPTXISD::Suld2DV2I16Zero: + case NVPTXISD::Suld2DV2I32Zero: + case NVPTXISD::Suld2DV2I64Zero: + case NVPTXISD::Suld2DV4I8Zero: + case NVPTXISD::Suld2DV4I16Zero: + case NVPTXISD::Suld2DV4I32Zero: + case NVPTXISD::Suld2DArrayI8Zero: + case NVPTXISD::Suld2DArrayI16Zero: + case NVPTXISD::Suld2DArrayI32Zero: + case NVPTXISD::Suld2DArrayI64Zero: + case NVPTXISD::Suld2DArrayV2I8Zero: + case NVPTXISD::Suld2DArrayV2I16Zero: + case NVPTXISD::Suld2DArrayV2I32Zero: + case NVPTXISD::Suld2DArrayV2I64Zero: + case NVPTXISD::Suld2DArrayV4I8Zero: + case NVPTXISD::Suld2DArrayV4I16Zero: + case NVPTXISD::Suld2DArrayV4I32Zero: + case NVPTXISD::Suld3DI8Zero: + case NVPTXISD::Suld3DI16Zero: + case NVPTXISD::Suld3DI32Zero: + case NVPTXISD::Suld3DI64Zero: + case NVPTXISD::Suld3DV2I8Zero: + case NVPTXISD::Suld3DV2I16Zero: + case NVPTXISD::Suld3DV2I32Zero: + case NVPTXISD::Suld3DV2I64Zero: + case NVPTXISD::Suld3DV4I8Zero: + case NVPTXISD::Suld3DV4I16Zero: + case NVPTXISD::Suld3DV4I32Zero: + ResNode = SelectSurfaceIntrinsic(N); + break; + case ISD::AND: + case ISD::SRA: + case ISD::SRL: + // Try to select BFE + ResNode = SelectBFE(N); + break; + case ISD::ADDRSPACECAST: + ResNode = SelectAddrSpaceCast(N); + break; default: break; } @@ -170,9 +510,24 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } +SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) { + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + return SelectLDGLDU(N); + } +} + static unsigned int getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) { - const Value *Src = 
N->getSrcValue(); + const Value *Src = N->getMemOperand()->getValue(); if (!Src) return NVPTX::PTXLdStInstCode::GENERIC; @@ -191,18 +546,96 @@ static unsigned int getCodeAddrSpace(MemSDNode *N, return NVPTX::PTXLdStInstCode::GENERIC; } +SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + default: + return nullptr; + case Intrinsic::nvvm_texsurf_handle_internal: + return SelectTexSurfHandle(N); + } +} + +SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { + // Op 0 is the intrinsic ID + SDValue Wrapper = N->getOperand(1); + SDValue GlobalVal = Wrapper.getOperand(0); + return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64, + GlobalVal); +} + +SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + SDValue Src = N->getOperand(0); + AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); + unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); + unsigned DstAddrSpace = CastN->getDestAddressSpace(); + + assert(SrcAddrSpace != DstAddrSpace && + "addrspacecast must be between different address spaces"); + + if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { + // Specific to generic + unsigned Opc; + switch (SrcAddrSpace) { + default: report_fatal_error("Bad address space in addrspacecast"); + case ADDRESS_SPACE_GLOBAL: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_global_yes_64 + : NVPTX::cvta_global_yes; + break; + case ADDRESS_SPACE_SHARED: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_shared_yes_64 + : NVPTX::cvta_shared_yes; + break; + case ADDRESS_SPACE_CONST: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_const_yes_64 + : NVPTX::cvta_const_yes; + break; + case ADDRESS_SPACE_LOCAL: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_local_yes_64 + : NVPTX::cvta_local_yes; + break; + } + return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src); + } else { + // Generic to specific + if (SrcAddrSpace != 0) + report_fatal_error("Cannot cast between two non-generic address spaces"); + unsigned Opc; + switch (DstAddrSpace) { + default: report_fatal_error("Bad address space in addrspacecast"); + case ADDRESS_SPACE_GLOBAL: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_global_yes_64 + : NVPTX::cvta_to_global_yes; + break; + case ADDRESS_SPACE_SHARED: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_shared_yes_64 + : NVPTX::cvta_to_shared_yes; + break; + case ADDRESS_SPACE_CONST: + Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_const_yes_64 + : NVPTX::cvta_to_const_yes; + break; + case ADDRESS_SPACE_LOCAL: + Opc = Subtarget.is64Bit() ? 
NVPTX::cvta_to_local_yes_64 + : NVPTX::cvta_to_local_yes; + break; + } + return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src); + } +} + SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { SDLoc dl(N); LoadSDNode *LD = cast<LoadSDNode>(N); EVT LoadedVT = LD->getMemoryVT(); - SDNode *NVPTXLD = NULL; + SDNode *NVPTXLD = nullptr; // do not support pre/post inc/dec if (LD->isIndexed()) - return NULL; + return nullptr; if (!LoadedVT.isSimple()) - return NULL; + return nullptr; // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget); @@ -225,7 +658,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { else if (num == 4) vecType = NVPTX::PTXLdStInstCode::V4; else - return NULL; + return nullptr; } // Type Setting: fromType + fromTypeWidth @@ -274,7 +707,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_avar; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(fromType), @@ -303,7 +736,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_asi; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(fromType), @@ -333,7 +766,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_ari_64; break; default: - return NULL; + return nullptr; } } else { switch (TargetVT) { @@ -356,7 +789,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_ari; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -385,7 +818,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_areg_64; break; default: - return NULL; + return nullptr; } } else { switch (TargetVT) { @@ -408,7 +841,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_areg; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -417,7 +850,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops); } - if (NVPTXLD != NULL) { + if (NVPTXLD) { MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); @@ -438,7 +871,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { EVT LoadedVT = MemSD->getMemoryVT(); if (!LoadedVT.isSimple()) - return NULL; + return nullptr; // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget); @@ -484,7 +917,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { VecType = NVPTX::PTXLdStInstCode::V4; break; default: - return NULL; + return nullptr; } EVT EltVT = N->getValueType(0); @@ -492,11 +925,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_avar; break; @@ -520,7 +953,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_avar; break; @@ -546,11 +979,11 @@ SDNode 
*NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_asi; break; @@ -574,7 +1007,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_asi; break; @@ -601,11 +1034,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari_64; break; @@ -629,7 +1062,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari_64; break; @@ -648,11 +1081,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari; break; @@ -676,7 +1109,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari; break; @@ -703,11 +1136,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg_64; break; @@ -731,7 +1164,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg_64; break; @@ -750,11 +1183,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg; break; @@ -778,7 +1211,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg; break; @@ -809,26 +1242,105 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { return LD; } -SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { +SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); + SDValue Op1; + MemSDNode *Mem; + bool IsLDG = true; + + // If this is an LDG intrinsic, the address is the third operand. 
Its its an + // LDG/LDU SD node (from custom vector handling), then its the second operand + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + Op1 = N->getOperand(2); + Mem = cast<MemIntrinsicSDNode>(N); + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + IsLDG = true; + break; + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + IsLDG = false; + break; + } + } else { + Op1 = N->getOperand(1); + Mem = cast<MemSDNode>(N); + } + unsigned Opcode; SDLoc DL(N); SDNode *LD; - MemSDNode *Mem = cast<MemSDNode>(N); SDValue Base, Offset, Addr; - EVT EltVT = Mem->getMemoryVT().getVectorElementType(); + EVT EltVT = Mem->getMemoryVT(); + if (EltVT.isVector()) { + EltVT = EltVT.getVectorElementType(); + } if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar; break; @@ -852,7 +1364,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar; break; @@ -876,7 +1388,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar; break; @@ -894,7 +1406,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar; break; @@ -912,19 +1424,67 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } SDValue Ops[] = { Addr, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), - ArrayRef<SDValue>(Ops, 2)); + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); } else if (Subtarget.is64Bit() ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64; break; @@ -948,7 +1508,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64; break; @@ -972,7 +1532,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64; break; @@ -990,7 +1550,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64; break; @@ -1009,11 +1569,60 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case 
MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32; break; @@ -1037,7 +1646,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32; break; @@ -1061,7 +1670,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32; break; @@ -1079,7 +1688,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32; break; @@ -1099,17 +1708,65 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { SDValue Ops[] = { Base, Offset, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), - ArrayRef<SDValue>(Ops, 3)); + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); } else { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64; break; @@ -1133,7 +1790,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64; break; @@ -1157,7 +1814,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64; break; @@ -1175,7 +1832,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64; break; @@ -1194,11 +1851,60 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = 
NVPTX::INT_PTX_LDG_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32; break; @@ -1222,7 +1928,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32; break; @@ -1246,7 +1952,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32; break; @@ -1264,7 +1970,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32; break; @@ -1283,12 +1989,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } SDValue Ops[] = { Op1, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), - ArrayRef<SDValue>(Ops, 2)); + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); - MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + MemRefs0[0] = Mem->getMemOperand(); cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); return LD; @@ -1298,14 +2003,14 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { SDLoc dl(N); StoreSDNode *ST = cast<StoreSDNode>(N); EVT StoreVT = ST->getMemoryVT(); - SDNode *NVPTXST = NULL; + SDNode *NVPTXST = nullptr; // do not support pre/post inc/dec if (ST->isIndexed()) - return NULL; + return nullptr; if (!StoreVT.isSimple()) - return NULL; + return nullptr; // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget); @@ -1328,7 +2033,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { else if (num == 4) vecType = NVPTX::PTXLdStInstCode::V4; else - return NULL; + return nullptr; } // Type Setting: toType + toTypeWidth @@ -1372,7 +2077,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_avar; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(toType), @@ -1401,7 +2106,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_asi; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), 
getI32Imm(toType), @@ -1431,7 +2136,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_ari_64; break; default: - return NULL; + return nullptr; } } else { switch (SourceVT) { @@ -1454,7 +2159,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_ari; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -1483,7 +2188,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_areg_64; break; default: - return NULL; + return nullptr; } } else { switch (SourceVT) { @@ -1506,7 +2211,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_areg; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -1515,7 +2220,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops); } - if (NVPTXST != NULL) { + if (NVPTXST) { MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); @@ -1582,7 +2287,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { N2 = N->getOperand(5); break; default: - return NULL; + return nullptr; } StOps.push_back(getI32Imm(IsVolatile)); @@ -1594,11 +2299,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_avar; break; @@ -1622,7 +2327,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_avar; break; @@ -1644,11 +2349,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_asi; break; @@ -1672,7 +2377,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_asi; break; @@ -1696,11 +2401,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari_64; break; @@ -1724,7 +2429,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari_64; break; @@ -1743,11 +2448,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari; break; @@ -1771,7 +2476,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) 
{ case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari; break; @@ -1794,11 +2499,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg_64; break; @@ -1822,7 +2527,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg_64; break; @@ -1841,11 +2546,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg; break; @@ -1869,7 +2574,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg; break; @@ -1910,7 +2615,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { unsigned VecSize; switch (Node->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadParam: VecSize = 1; break; @@ -1929,11 +2634,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { switch (VecSize) { default: - return NULL; + return nullptr; case 1: switch (MemVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opc = NVPTX::LoadParamMemI8; break; @@ -1960,7 +2665,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { case 2: switch (MemVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opc = NVPTX::LoadParamMemV2I8; break; @@ -1987,7 +2692,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { case 4: switch (MemVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opc = NVPTX::LoadParamMemV4I8; break; @@ -2014,7 +2719,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); } else { EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(&EVTs[0], 5); + VTs = CurDAG->getVTList(EVTs); } unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); @@ -2040,7 +2745,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { unsigned NumElts = 1; switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreRetval: NumElts = 1; break; @@ -2065,11 +2770,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { unsigned Opcode = 0; switch (NumElts) { default: - return NULL; + return nullptr; case 1: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreRetvalI8; break; @@ -2096,7 +2801,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { case 2: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreRetvalV2I8; break; @@ -2123,7 +2828,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { case 4: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return 
nullptr; case MVT::i1: Opcode = NVPTX::StoreRetvalV4I8; break; @@ -2166,7 +2871,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { unsigned NumElts = 1; switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreParamU32: case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParam: @@ -2197,11 +2902,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { default: switch (NumElts) { default: - return NULL; + return nullptr; case 1: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreParamI8; break; @@ -2228,7 +2933,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { case 2: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreParamV2I8; break; @@ -2255,7 +2960,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { case 4: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreParamV4I8; break; @@ -2308,6 +3013,1940 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { return Ret; } +SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDNode *Ret = nullptr; + unsigned Opc = 0; + SmallVector<SDValue, 8> Ops; + + switch (N->getOpcode()) { + default: return nullptr; + case NVPTXISD::Tex1DFloatS32: + Opc = NVPTX::TEX_1D_F32_S32; + break; + case NVPTXISD::Tex1DFloatFloat: + Opc = NVPTX::TEX_1D_F32_F32; + break; + case NVPTXISD::Tex1DFloatFloatLevel: + Opc = NVPTX::TEX_1D_F32_F32_LEVEL; + break; + case NVPTXISD::Tex1DFloatFloatGrad: + Opc = NVPTX::TEX_1D_F32_F32_GRAD; + break; + case NVPTXISD::Tex1DS32S32: + Opc = NVPTX::TEX_1D_S32_S32; + break; + case NVPTXISD::Tex1DS32Float: + Opc = NVPTX::TEX_1D_S32_F32; + break; + case NVPTXISD::Tex1DS32FloatLevel: + Opc = NVPTX::TEX_1D_S32_F32_LEVEL; + break; + case NVPTXISD::Tex1DS32FloatGrad: + Opc = NVPTX::TEX_1D_S32_F32_GRAD; + break; + case NVPTXISD::Tex1DU32S32: + Opc = NVPTX::TEX_1D_U32_S32; + break; + case NVPTXISD::Tex1DU32Float: + Opc = NVPTX::TEX_1D_U32_F32; + break; + case NVPTXISD::Tex1DU32FloatLevel: + Opc = NVPTX::TEX_1D_U32_F32_LEVEL; + break; + case NVPTXISD::Tex1DU32FloatGrad: + Opc = NVPTX::TEX_1D_U32_F32_GRAD; + break; + case NVPTXISD::Tex1DArrayFloatS32: + Opc = NVPTX::TEX_1D_ARRAY_F32_S32; + break; + case NVPTXISD::Tex1DArrayFloatFloat: + Opc = NVPTX::TEX_1D_ARRAY_F32_F32; + break; + case NVPTXISD::Tex1DArrayFloatFloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::Tex1DArrayFloatFloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::Tex1DArrayS32S32: + Opc = NVPTX::TEX_1D_ARRAY_S32_S32; + break; + case NVPTXISD::Tex1DArrayS32Float: + Opc = NVPTX::TEX_1D_ARRAY_S32_F32; + break; + case NVPTXISD::Tex1DArrayS32FloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::Tex1DArrayS32FloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD; + break; + case NVPTXISD::Tex1DArrayU32S32: + Opc = NVPTX::TEX_1D_ARRAY_U32_S32; + break; + case NVPTXISD::Tex1DArrayU32Float: + Opc = NVPTX::TEX_1D_ARRAY_U32_F32; + break; + case NVPTXISD::Tex1DArrayU32FloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tex1DArrayU32FloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::Tex2DFloatS32: + Opc = NVPTX::TEX_2D_F32_S32; + break; + case NVPTXISD::Tex2DFloatFloat: + Opc = NVPTX::TEX_2D_F32_F32; + break; + 
case NVPTXISD::Tex2DFloatFloatLevel: + Opc = NVPTX::TEX_2D_F32_F32_LEVEL; + break; + case NVPTXISD::Tex2DFloatFloatGrad: + Opc = NVPTX::TEX_2D_F32_F32_GRAD; + break; + case NVPTXISD::Tex2DS32S32: + Opc = NVPTX::TEX_2D_S32_S32; + break; + case NVPTXISD::Tex2DS32Float: + Opc = NVPTX::TEX_2D_S32_F32; + break; + case NVPTXISD::Tex2DS32FloatLevel: + Opc = NVPTX::TEX_2D_S32_F32_LEVEL; + break; + case NVPTXISD::Tex2DS32FloatGrad: + Opc = NVPTX::TEX_2D_S32_F32_GRAD; + break; + case NVPTXISD::Tex2DU32S32: + Opc = NVPTX::TEX_2D_U32_S32; + break; + case NVPTXISD::Tex2DU32Float: + Opc = NVPTX::TEX_2D_U32_F32; + break; + case NVPTXISD::Tex2DU32FloatLevel: + Opc = NVPTX::TEX_2D_U32_F32_LEVEL; + break; + case NVPTXISD::Tex2DU32FloatGrad: + Opc = NVPTX::TEX_2D_U32_F32_GRAD; + break; + case NVPTXISD::Tex2DArrayFloatS32: + Opc = NVPTX::TEX_2D_ARRAY_F32_S32; + break; + case NVPTXISD::Tex2DArrayFloatFloat: + Opc = NVPTX::TEX_2D_ARRAY_F32_F32; + break; + case NVPTXISD::Tex2DArrayFloatFloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::Tex2DArrayFloatFloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::Tex2DArrayS32S32: + Opc = NVPTX::TEX_2D_ARRAY_S32_S32; + break; + case NVPTXISD::Tex2DArrayS32Float: + Opc = NVPTX::TEX_2D_ARRAY_S32_F32; + break; + case NVPTXISD::Tex2DArrayS32FloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::Tex2DArrayS32FloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD; + break; + case NVPTXISD::Tex2DArrayU32S32: + Opc = NVPTX::TEX_2D_ARRAY_U32_S32; + break; + case NVPTXISD::Tex2DArrayU32Float: + Opc = NVPTX::TEX_2D_ARRAY_U32_F32; + break; + case NVPTXISD::Tex2DArrayU32FloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tex2DArrayU32FloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::Tex3DFloatS32: + Opc = NVPTX::TEX_3D_F32_S32; + break; + case NVPTXISD::Tex3DFloatFloat: + Opc = NVPTX::TEX_3D_F32_F32; + break; + case NVPTXISD::Tex3DFloatFloatLevel: + Opc = NVPTX::TEX_3D_F32_F32_LEVEL; + break; + case NVPTXISD::Tex3DFloatFloatGrad: + Opc = NVPTX::TEX_3D_F32_F32_GRAD; + break; + case NVPTXISD::Tex3DS32S32: + Opc = NVPTX::TEX_3D_S32_S32; + break; + case NVPTXISD::Tex3DS32Float: + Opc = NVPTX::TEX_3D_S32_F32; + break; + case NVPTXISD::Tex3DS32FloatLevel: + Opc = NVPTX::TEX_3D_S32_F32_LEVEL; + break; + case NVPTXISD::Tex3DS32FloatGrad: + Opc = NVPTX::TEX_3D_S32_F32_GRAD; + break; + case NVPTXISD::Tex3DU32S32: + Opc = NVPTX::TEX_3D_U32_S32; + break; + case NVPTXISD::Tex3DU32Float: + Opc = NVPTX::TEX_3D_U32_F32; + break; + case NVPTXISD::Tex3DU32FloatLevel: + Opc = NVPTX::TEX_3D_U32_F32_LEVEL; + break; + case NVPTXISD::Tex3DU32FloatGrad: + Opc = NVPTX::TEX_3D_U32_F32_GRAD; + break; + case NVPTXISD::TexCubeFloatFloat: + Opc = NVPTX::TEX_CUBE_F32_F32; + break; + case NVPTXISD::TexCubeFloatFloatLevel: + Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL; + break; + case NVPTXISD::TexCubeS32Float: + Opc = NVPTX::TEX_CUBE_S32_F32; + break; + case NVPTXISD::TexCubeS32FloatLevel: + Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL; + break; + case NVPTXISD::TexCubeU32Float: + Opc = NVPTX::TEX_CUBE_U32_F32; + break; + case NVPTXISD::TexCubeU32FloatLevel: + Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL; + break; + case NVPTXISD::TexCubeArrayFloatFloat: + Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32; + break; + case NVPTXISD::TexCubeArrayFloatFloatLevel: + Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexCubeArrayS32Float: + Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32; + break; + case 
NVPTXISD::TexCubeArrayS32FloatLevel: + Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexCubeArrayU32Float: + Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32; + break; + case NVPTXISD::TexCubeArrayU32FloatLevel: + Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tld4R2DFloatFloat: + Opc = NVPTX::TLD4_R_2D_F32_F32; + break; + case NVPTXISD::Tld4G2DFloatFloat: + Opc = NVPTX::TLD4_G_2D_F32_F32; + break; + case NVPTXISD::Tld4B2DFloatFloat: + Opc = NVPTX::TLD4_B_2D_F32_F32; + break; + case NVPTXISD::Tld4A2DFloatFloat: + Opc = NVPTX::TLD4_A_2D_F32_F32; + break; + case NVPTXISD::Tld4R2DS64Float: + Opc = NVPTX::TLD4_R_2D_S32_F32; + break; + case NVPTXISD::Tld4G2DS64Float: + Opc = NVPTX::TLD4_G_2D_S32_F32; + break; + case NVPTXISD::Tld4B2DS64Float: + Opc = NVPTX::TLD4_B_2D_S32_F32; + break; + case NVPTXISD::Tld4A2DS64Float: + Opc = NVPTX::TLD4_A_2D_S32_F32; + break; + case NVPTXISD::Tld4R2DU64Float: + Opc = NVPTX::TLD4_R_2D_U32_F32; + break; + case NVPTXISD::Tld4G2DU64Float: + Opc = NVPTX::TLD4_G_2D_U32_F32; + break; + case NVPTXISD::Tld4B2DU64Float: + Opc = NVPTX::TLD4_B_2D_U32_F32; + break; + case NVPTXISD::Tld4A2DU64Float: + Opc = NVPTX::TLD4_A_2D_U32_F32; + break; + case NVPTXISD::TexUnified1DFloatS32: + Opc = NVPTX::TEX_UNIFIED_1D_F32_S32; + break; + case NVPTXISD::TexUnified1DFloatFloat: + Opc = NVPTX::TEX_UNIFIED_1D_F32_F32; + break; + case NVPTXISD::TexUnified1DFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DS32S32: + Opc = NVPTX::TEX_UNIFIED_1D_S32_S32; + break; + case NVPTXISD::TexUnified1DS32Float: + Opc = NVPTX::TEX_UNIFIED_1D_S32_F32; + break; + case NVPTXISD::TexUnified1DS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DU32S32: + Opc = NVPTX::TEX_UNIFIED_1D_U32_S32; + break; + case NVPTXISD::TexUnified1DU32Float: + Opc = NVPTX::TEX_UNIFIED_1D_U32_F32; + break; + case NVPTXISD::TexUnified1DU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DArrayFloatS32: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32; + break; + case NVPTXISD::TexUnified1DArrayFloatFloat: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32; + break; + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DArrayS32S32: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32; + break; + case NVPTXISD::TexUnified1DArrayS32Float: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32; + break; + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified1DArrayU32S32: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32; + break; + case NVPTXISD::TexUnified1DArrayU32Float: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32; + break; + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD; + break; + 
case NVPTXISD::TexUnified2DFloatS32: + Opc = NVPTX::TEX_UNIFIED_2D_F32_S32; + break; + case NVPTXISD::TexUnified2DFloatFloat: + Opc = NVPTX::TEX_UNIFIED_2D_F32_F32; + break; + case NVPTXISD::TexUnified2DFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DS32S32: + Opc = NVPTX::TEX_UNIFIED_2D_S32_S32; + break; + case NVPTXISD::TexUnified2DS32Float: + Opc = NVPTX::TEX_UNIFIED_2D_S32_F32; + break; + case NVPTXISD::TexUnified2DS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DU32S32: + Opc = NVPTX::TEX_UNIFIED_2D_U32_S32; + break; + case NVPTXISD::TexUnified2DU32Float: + Opc = NVPTX::TEX_UNIFIED_2D_U32_F32; + break; + case NVPTXISD::TexUnified2DU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DArrayFloatS32: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32; + break; + case NVPTXISD::TexUnified2DArrayFloatFloat: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32; + break; + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DArrayS32S32: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32; + break; + case NVPTXISD::TexUnified2DArrayS32Float: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32; + break; + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified2DArrayU32S32: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32; + break; + case NVPTXISD::TexUnified2DArrayU32Float: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32; + break; + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD; + break; + case NVPTXISD::TexUnified3DFloatS32: + Opc = NVPTX::TEX_UNIFIED_3D_F32_S32; + break; + case NVPTXISD::TexUnified3DFloatFloat: + Opc = NVPTX::TEX_UNIFIED_3D_F32_F32; + break; + case NVPTXISD::TexUnified3DFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnified3DFloatFloatGrad: + Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD; + break; + case NVPTXISD::TexUnified3DS32S32: + Opc = NVPTX::TEX_UNIFIED_3D_S32_S32; + break; + case NVPTXISD::TexUnified3DS32Float: + Opc = NVPTX::TEX_UNIFIED_3D_S32_F32; + break; + case NVPTXISD::TexUnified3DS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnified3DS32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD; + break; + case NVPTXISD::TexUnified3DU32S32: + Opc = NVPTX::TEX_UNIFIED_3D_U32_S32; + break; + case NVPTXISD::TexUnified3DU32Float: + Opc = NVPTX::TEX_UNIFIED_3D_U32_F32; + break; + case NVPTXISD::TexUnified3DU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnified3DU32FloatGrad: + Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD; + break; + case NVPTXISD::TexUnifiedCubeFloatFloat: + Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32; + break; + case 
NVPTXISD::TexUnifiedCubeFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeS32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32; + break; + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeU32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32; + break; + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32; + break; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeArrayS32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32; + break; + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL; + break; + case NVPTXISD::TexUnifiedCubeArrayU32Float: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32; + break; + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL; + break; + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32; + break; + case NVPTXISD::Tld4UnifiedR2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedG2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedB2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedA2DS64Float: + Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32; + break; + case NVPTXISD::Tld4UnifiedR2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32; + break; + case NVPTXISD::Tld4UnifiedG2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32; + break; + case NVPTXISD::Tld4UnifiedB2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32; + break; + case NVPTXISD::Tld4UnifiedA2DU64Float: + Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32; + break; + } + + // Copy over operands + for (unsigned i = 1; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + + Ops.push_back(Chain); + Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); + return Ret; +} + +SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue TexHandle = N->getOperand(1); + SDNode *Ret = nullptr; + unsigned Opc = 0; + SmallVector<SDValue, 8> Ops; + switch (N->getOpcode()) { + default: return nullptr; + case NVPTXISD::Suld1DI8Clamp: + Opc = NVPTX::SULD_1D_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI16Clamp: + Opc = NVPTX::SULD_1D_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI32Clamp: + Opc = NVPTX::SULD_1D_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI64Clamp: + Opc = NVPTX::SULD_1D_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I8Clamp: + Opc = NVPTX::SULD_1D_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + 
Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I16Clamp: + Opc = NVPTX::SULD_1D_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I32Clamp: + Opc = NVPTX::SULD_1D_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I64Clamp: + Opc = NVPTX::SULD_1D_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I8Clamp: + Opc = NVPTX::SULD_1D_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I16Clamp: + Opc = NVPTX::SULD_1D_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I32Clamp: + Opc = NVPTX::SULD_1D_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI8Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI16Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI32Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI64Clamp: + Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I8Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I16Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I32Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I64Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I8Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I16Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I32Clamp: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI8Clamp: + Opc = NVPTX::SULD_2D_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case 
NVPTXISD::Suld2DI16Clamp: + Opc = NVPTX::SULD_2D_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI32Clamp: + Opc = NVPTX::SULD_2D_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI64Clamp: + Opc = NVPTX::SULD_2D_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I8Clamp: + Opc = NVPTX::SULD_2D_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I16Clamp: + Opc = NVPTX::SULD_2D_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I32Clamp: + Opc = NVPTX::SULD_2D_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I64Clamp: + Opc = NVPTX::SULD_2D_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Clamp: + Opc = NVPTX::SULD_2D_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Clamp: + Opc = NVPTX::SULD_2D_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Clamp: + Opc = NVPTX::SULD_2D_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI32Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI64Clamp: + Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I8Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I16Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I32Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP; + 
Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I64Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I8Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I16Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I32Clamp: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI8Clamp: + Opc = NVPTX::SULD_3D_I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI16Clamp: + Opc = NVPTX::SULD_3D_I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI32Clamp: + Opc = NVPTX::SULD_3D_I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI64Clamp: + Opc = NVPTX::SULD_3D_I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I8Clamp: + Opc = NVPTX::SULD_3D_V2I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I16Clamp: + Opc = NVPTX::SULD_3D_V2I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Clamp: + Opc = NVPTX::SULD_3D_V2I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I64Clamp: + Opc = NVPTX::SULD_3D_V2I64_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Clamp: + Opc = NVPTX::SULD_3D_V4I8_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Clamp: + Opc = NVPTX::SULD_3D_V4I16_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Clamp: + Opc = 
NVPTX::SULD_3D_V4I32_CLAMP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI8Trap: + Opc = NVPTX::SULD_1D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI16Trap: + Opc = NVPTX::SULD_1D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI32Trap: + Opc = NVPTX::SULD_1D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI64Trap: + Opc = NVPTX::SULD_1D_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I8Trap: + Opc = NVPTX::SULD_1D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I16Trap: + Opc = NVPTX::SULD_1D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I32Trap: + Opc = NVPTX::SULD_1D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I64Trap: + Opc = NVPTX::SULD_1D_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I8Trap: + Opc = NVPTX::SULD_1D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I16Trap: + Opc = NVPTX::SULD_1D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I32Trap: + Opc = NVPTX::SULD_1D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI8Trap: + Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI16Trap: + Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI32Trap: + Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI64Trap: + Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I8Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I16Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I32Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I64Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP; + Ops.push_back(TexHandle); + 
Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I8Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I16Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I32Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI8Trap: + Opc = NVPTX::SULD_2D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI16Trap: + Opc = NVPTX::SULD_2D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI32Trap: + Opc = NVPTX::SULD_2D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI64Trap: + Opc = NVPTX::SULD_2D_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I8Trap: + Opc = NVPTX::SULD_2D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I16Trap: + Opc = NVPTX::SULD_2D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I32Trap: + Opc = NVPTX::SULD_2D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I64Trap: + Opc = NVPTX::SULD_2D_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Trap: + Opc = NVPTX::SULD_2D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Trap: + Opc = NVPTX::SULD_2D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Trap: + Opc = NVPTX::SULD_2D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Trap: + Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Trap: + Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI32Trap: + Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + 
Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI64Trap: + Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I8Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I16Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I32Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I64Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I8Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I16Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I32Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI8Trap: + Opc = NVPTX::SULD_3D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI16Trap: + Opc = NVPTX::SULD_3D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI32Trap: + Opc = NVPTX::SULD_3D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI64Trap: + Opc = NVPTX::SULD_3D_I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I8Trap: + Opc = NVPTX::SULD_3D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I16Trap: + Opc = NVPTX::SULD_3D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Trap: + Opc = NVPTX::SULD_3D_V2I32_TRAP; + Ops.push_back(TexHandle); + 
Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I64Trap: + Opc = NVPTX::SULD_3D_V2I64_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Trap: + Opc = NVPTX::SULD_3D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Trap: + Opc = NVPTX::SULD_3D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Trap: + Opc = NVPTX::SULD_3D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI8Zero: + Opc = NVPTX::SULD_1D_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI16Zero: + Opc = NVPTX::SULD_1D_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI32Zero: + Opc = NVPTX::SULD_1D_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI64Zero: + Opc = NVPTX::SULD_1D_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I8Zero: + Opc = NVPTX::SULD_1D_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I16Zero: + Opc = NVPTX::SULD_1D_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I32Zero: + Opc = NVPTX::SULD_1D_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I64Zero: + Opc = NVPTX::SULD_1D_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I8Zero: + Opc = NVPTX::SULD_1D_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I16Zero: + Opc = NVPTX::SULD_1D_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I32Zero: + Opc = NVPTX::SULD_1D_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI8Zero: + Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI16Zero: + Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI32Zero: + Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI64Zero: + Opc = 
NVPTX::SULD_1D_ARRAY_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I8Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I16Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I32Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I64Zero: + Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I8Zero: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I16Zero: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I32Zero: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI8Zero: + Opc = NVPTX::SULD_2D_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI16Zero: + Opc = NVPTX::SULD_2D_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI32Zero: + Opc = NVPTX::SULD_2D_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI64Zero: + Opc = NVPTX::SULD_2D_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I8Zero: + Opc = NVPTX::SULD_2D_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I16Zero: + Opc = NVPTX::SULD_2D_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I32Zero: + Opc = NVPTX::SULD_2D_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I64Zero: + Opc = NVPTX::SULD_2D_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Zero: + Opc = NVPTX::SULD_2D_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Zero: + Opc = NVPTX::SULD_2D_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + 
Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Zero: + Opc = NVPTX::SULD_2D_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Zero: + Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Zero: + Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI32Zero: + Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI64Zero: + Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I8Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I16Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I32Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I64Zero: + Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I8Zero: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I16Zero: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I32Zero: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI8Zero: + Opc = NVPTX::SULD_3D_I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI16Zero: + Opc = NVPTX::SULD_3D_I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI32Zero: + Opc = NVPTX::SULD_3D_I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + 
Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI64Zero: + Opc = NVPTX::SULD_3D_I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I8Zero: + Opc = NVPTX::SULD_3D_V2I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I16Zero: + Opc = NVPTX::SULD_3D_V2I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Zero: + Opc = NVPTX::SULD_3D_V2I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I64Zero: + Opc = NVPTX::SULD_3D_V2I64_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Zero: + Opc = NVPTX::SULD_3D_V4I8_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Zero: + Opc = NVPTX::SULD_3D_V4I16_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Zero: + Opc = NVPTX::SULD_3D_V4I32_ZERO; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + } + Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); + return Ret; +} + + +/// SelectBFE - Look for instruction sequences that can be made more efficient +/// by using the 'bfe' (bit-field extract) PTX instruction +SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Len; + SDValue Start; + SDValue Val; + bool IsSigned = false; + + if (N->getOpcode() == ISD::AND) { + // Canonicalize the operands + // We want 'and %val, %mask' + if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) { + std::swap(LHS, RHS); + } + + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS); + if (!Mask) { + // We need a constant mask on the RHS of the AND + return NULL; + } + + // Extract the mask bits + uint64_t MaskVal = Mask->getZExtValue(); + if (!isMask_64(MaskVal)) { + // We *could* handle shifted masks here, but doing so would require an + // 'and' operation to fix up the low-order bits so we would trade + // shr+and for bfe+and, which has the same throughput + return NULL; + } + + // How many bits are in our mask? 
+    uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+    Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+
+    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+      // We have a 'srl/and' pair, extract the effective start bit and length
+      Val = LHS.getNode()->getOperand(0);
+      Start = LHS.getNode()->getOperand(1);
+      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+      if (StartConst) {
+        uint64_t StartVal = StartConst->getZExtValue();
+        // How many "good" bits do we have left? "good" is defined here as bits
+        // that exist in the original value, not shifted in.
+        uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
+        if (NumBits > GoodBits) {
+          // Do not handle the case where bits have been shifted in. In theory
+          // we could handle this, but the cost is likely higher than just
+          // emitting the srl/and pair.
+          return NULL;
+        }
+        Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
+      } else {
+        // Do not handle the case where the shift amount (can be zero if no srl
+        // was found) is not constant. We could handle this case, but it would
+        // require run-time logic that would be more expensive than just
+        // emitting the srl/and pair.
+        return NULL;
+      }
+    } else {
+      // Do not handle the case where the LHS of the and is not a shift. While
+      // it would be trivial to handle this case, it would just transform
+      // 'and' -> 'bfe', but 'and' has higher-throughput.
+      return NULL;
+    }
+  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+    if (LHS->getOpcode() == ISD::AND) {
+      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+      if (!ShiftCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+
+      uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+      SDValue AndLHS = LHS->getOperand(0);
+      SDValue AndRHS = LHS->getOperand(1);
+
+      // Canonicalize the AND to have the mask on the RHS
+      if (isa<ConstantSDNode>(AndLHS)) {
+        std::swap(AndLHS, AndRHS);
+      }
+
+      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+      if (!MaskCnst) {
+        // Mask must be constant
+        return NULL;
+      }
+
+      uint64_t MaskVal = MaskCnst->getZExtValue();
+      uint64_t NumZeros;
+      uint64_t NumBits;
+      if (isMask_64(MaskVal)) {
+        NumZeros = 0;
+        // The number of bits in the result bitfield will be the number of
+        // trailing ones (the AND) minus the number of bits we shift off
+        NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+      } else if (isShiftedMask_64(MaskVal)) {
+        NumZeros = countTrailingZeros(MaskVal);
+        unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+        // The number of bits in the result bitfield will be the number of
+        // trailing zeros plus the number of set bits in the mask minus the
+        // number of bits we shift off
+        NumBits = NumZeros + NumOnes - ShiftAmt;
+      } else {
+        // This is not a mask we can handle
+        return NULL;
+      }
+
+      if (ShiftAmt < NumZeros) {
+        // Handling this case would require extra logic that would make this
+        // transformation non-profitable
+        return NULL;
+      }
+
+      Val = AndLHS;
+      Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
+      Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+    } else if (LHS->getOpcode() == ISD::SHL) {
+      // Here, we have a pattern like:
+      //
+      // (sra (shl val, NN), MM)
+      // or
+      // (srl (shl val, NN), MM)
+      //
+      // If MM >= NN, we can efficiently optimize this with bfe
+      Val = LHS->getOperand(0);
+
+      SDValue ShlRHS = LHS->getOperand(1);
+      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+      if (!ShlCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+      SDValue ShrRHS = RHS;
+      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+      if (!ShrCnst) {
+        // Shift amount must be constant
+        return NULL;
+      }
+      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+      // To avoid extra codegen and be profitable, we need Outer >= Inner
+      if (OuterShiftAmt < InnerShiftAmt) {
+        return NULL;
+      }
+
+      // If the outer shift is more than the type size, we have no bitfield to
+      // extract (since we also check that the inner shift is <= the outer shift
+      // then this also implies that the inner shift is < the type size)
+      if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
+        return NULL;
+      }
+
+      Start =
+        CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
+      Len =
+        CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
+                                  OuterShiftAmt, MVT::i32);
+
+      if (N->getOpcode() == ISD::SRA) {
+        // If we have a arithmetic right shift, we need to use the signed bfe
+        // variant
+        IsSigned = true;
+      }
+    } else {
+      // No can do...
+      return NULL;
+    }
+  } else {
+    // No can do...
+    return NULL;
+  }
+
+
+  unsigned Opc;
+  // For the BFE operations we form here from "and" and "srl", always use the
+  // unsigned variants.
+  if (Val.getValueType() == MVT::i32) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S32rii;
+    } else {
+      Opc = NVPTX::BFE_U32rii;
+    }
+  } else if (Val.getValueType() == MVT::i64) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S64rii;
+    } else {
+      Opc = NVPTX::BFE_U64rii;
+    }
+  } else {
+    // We cannot handle this type
+    return NULL;
+  }
+
+  SDValue Ops[] = {
+    Val, Start, Len
+  };
+
+  SDNode *Ret =
+    CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+
+  return Ret;
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2401,14 +5040,18 @@ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                  unsigned int spN) const {
-  const Value *Src = NULL;
+  const Value *Src = nullptr;
   // Even though MemIntrinsicSDNode is a subclas of MemSDNode,
   // the classof() for MemSDNode does not include MemIntrinsicSDNode
   // (See SelectionDAGNodes.h). So we need to check for both.
   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
-    Src = mN->getSrcValue();
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
   } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
-    Src = mN->getSrcValue();
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
   }
   if (!Src)
     return false;
@@ -2440,24 +5083,3 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
   }
   return true;
 }
-
-// Return true if N is a undef or a constant.
-// If N was undef, return a (i8imm 0) in Retval
-// If N was imm, convert it to i8imm and return in Retval
-// Note: The convert to i8imm is required, otherwise the
-// pattern matcher inserts a bunch of IMOVi8rr to convert
-// the imm to i8imm, and this causes instruction selection
-// to fail.
-bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N, SDValue &Retval) {
-  if (!(N.getOpcode() == ISD::UNDEF) && !(N.getOpcode() == ISD::Constant))
-    return false;
-
-  if (N.getOpcode() == ISD::UNDEF)
-    Retval = CurDAG->getTargetConstant(0, MVT::i8);
-  else {
-    ConstantSDNode *cn = cast<ConstantSDNode>(N.getNode());
-    unsigned retval = cn->getZExtValue();
-    Retval = CurDAG->getTargetConstant(retval, MVT::i8);
-  }
-  return true;
-}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index d961e5014532..c62fc253c33d 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "nvptx-isel"
-
 #include "NVPTX.h"
 #include "NVPTXISelLowering.h"
 #include "NVPTXRegisterInfo.h"
@@ -26,47 +24,48 @@ namespace {
 class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
-  // If true, generate corresponding FPCONTRACT. This is
-  // language dependent (i.e. CUDA and OpenCL works differently).
-  bool doFMAF64;
-  bool doFMAF32;
-  bool doFMAF64AGG;
-  bool doFMAF32AGG;
-  bool allowFMA;
-
   // If true, generate mul.wide from sext and mul
   bool doMulWide;
   int getDivF32Level() const;
   bool usePrecSqrtF32() const;
   bool useF32FTZ() const;
+  bool allowFMA() const;
 public:
   explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                              CodeGenOpt::Level OptLevel);
   // Pass Name
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
    return "NVPTX DAG->DAG Pattern Instruction Selection";
   }
   const NVPTXSubtarget &Subtarget;
-  virtual bool SelectInlineAsmMemoryOperand(
-    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps);
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
 private:
   // Include the pieces autogenerated from the target description.
 #include "NVPTXGenDAGISel.inc"
-  SDNode *Select(SDNode *N);
+  SDNode *Select(SDNode *N) override;
+  SDNode *SelectIntrinsicNoChain(SDNode *N);
+  SDNode *SelectIntrinsicChain(SDNode *N);
+  SDNode *SelectTexSurfHandle(SDNode *N);
   SDNode *SelectLoad(SDNode *N);
   SDNode *SelectLoadVector(SDNode *N);
-  SDNode *SelectLDGLDUVector(SDNode *N);
+  SDNode *SelectLDGLDU(SDNode *N);
   SDNode *SelectStore(SDNode *N);
   SDNode *SelectStoreVector(SDNode *N);
   SDNode *SelectLoadParam(SDNode *N);
   SDNode *SelectStoreRetval(SDNode *N);
   SDNode *SelectStoreParam(SDNode *N);
+  SDNode *SelectAddrSpaceCast(SDNode *N);
+  SDNode *SelectTextureIntrinsic(SDNode *N);
+  SDNode *SelectSurfaceIntrinsic(SDNode *N);
+  SDNode *SelectBFE(SDNode *N);
   inline SDValue getI32Imm(unsigned Imm) {
     return CurDAG->getTargetConstant(Imm, MVT::i32);
@@ -91,7 +90,5 @@ private:
   bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
-  bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval);
-
 };
 }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6a8be753c87c..d76b20a29eb7 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
@@ -29,10 +30,10 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <sstream>
@@ -47,6 +48,12 @@ static cl::opt<bool> sched4reg(
     "nvptx-sched4reg",
     cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"),
+                    cl::init(2));
+
 static bool IsPTXVectorType(MVT VT) {
   switch (VT.SimpleTy) {
   default:
@@ -75,7 +82,7 @@ static bool IsPTXVectorType(MVT VT) {
 /// LowerCall, and LowerReturn.
 static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
                                SmallVectorImpl<EVT> &ValueVTs,
-                               SmallVectorImpl<uint64_t> *Offsets = 0,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                                uint64_t StartingOffset = 0) {
   SmallVector<EVT, 16> TempVTs;
   SmallVector<uint64_t, 16> TempOffsets;
@@ -111,6 +118,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
   // condition branches.
@@ -130,7 +138,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
   // Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -146,6 +160,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); + if (nvptxSubtarget.hasROT64()) { setOperationAction(ISD::ROTL, MVT::i64, Legal); setOperationAction(ISD::ROTR, MVT::i64, Legal); @@ -182,8 +203,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); // Turn FP extload into load/fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // PTX does not support load / store predicate registers @@ -237,6 +261,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); + // We have some custom DAG combine patterns for these nodes + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SHL); + // Now deduce the information based on the above mentioned // actions computeRegisterProperties(); @@ -245,7 +276,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: - return 0; + return nullptr; case NVPTXISD::CALL: return "NVPTXISD::CALL"; case NVPTXISD::RET_FLAG: @@ -328,11 +359,509 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::StoreV2"; case NVPTXISD::StoreV4: return "NVPTXISD::StoreV4"; + case NVPTXISD::FUN_SHFL_CLAMP: + return "NVPTXISD::FUN_SHFL_CLAMP"; + case NVPTXISD::FUN_SHFR_CLAMP: + return "NVPTXISD::FUN_SHFR_CLAMP"; + case NVPTXISD::IMAD: + return "NVPTXISD::IMAD"; + case NVPTXISD::MUL_WIDE_SIGNED: + return "NVPTXISD::MUL_WIDE_SIGNED"; + case NVPTXISD::MUL_WIDE_UNSIGNED: + return "NVPTXISD::MUL_WIDE_UNSIGNED"; + case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; + case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; + case NVPTXISD::Tex1DFloatFloatLevel: + return "NVPTXISD::Tex1DFloatFloatLevel"; + case NVPTXISD::Tex1DFloatFloatGrad: + return "NVPTXISD::Tex1DFloatFloatGrad"; + case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; + case NVPTXISD::Tex1DS32Float: return 
"NVPTXISD::Tex1DS32Float"; + case NVPTXISD::Tex1DS32FloatLevel: + return "NVPTXISD::Tex1DS32FloatLevel"; + case NVPTXISD::Tex1DS32FloatGrad: + return "NVPTXISD::Tex1DS32FloatGrad"; + case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; + case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; + case NVPTXISD::Tex1DU32FloatLevel: + return "NVPTXISD::Tex1DU32FloatLevel"; + case NVPTXISD::Tex1DU32FloatGrad: + return "NVPTXISD::Tex1DU32FloatGrad"; + case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; + case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; + case NVPTXISD::Tex1DArrayFloatFloatLevel: + return "NVPTXISD::Tex1DArrayFloatFloatLevel"; + case NVPTXISD::Tex1DArrayFloatFloatGrad: + return "NVPTXISD::Tex1DArrayFloatFloatGrad"; + case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; + case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; + case NVPTXISD::Tex1DArrayS32FloatLevel: + return "NVPTXISD::Tex1DArrayS32FloatLevel"; + case NVPTXISD::Tex1DArrayS32FloatGrad: + return "NVPTXISD::Tex1DArrayS32FloatGrad"; + case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; + case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; + case NVPTXISD::Tex1DArrayU32FloatLevel: + return "NVPTXISD::Tex1DArrayU32FloatLevel"; + case NVPTXISD::Tex1DArrayU32FloatGrad: + return "NVPTXISD::Tex1DArrayU32FloatGrad"; + case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; + case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; + case NVPTXISD::Tex2DFloatFloatLevel: + return "NVPTXISD::Tex2DFloatFloatLevel"; + case NVPTXISD::Tex2DFloatFloatGrad: + return "NVPTXISD::Tex2DFloatFloatGrad"; + case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; + case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; + case NVPTXISD::Tex2DS32FloatLevel: + return "NVPTXISD::Tex2DS32FloatLevel"; + case NVPTXISD::Tex2DS32FloatGrad: + return "NVPTXISD::Tex2DS32FloatGrad"; + case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; + case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; + case NVPTXISD::Tex2DU32FloatLevel: + return "NVPTXISD::Tex2DU32FloatLevel"; + case NVPTXISD::Tex2DU32FloatGrad: + return "NVPTXISD::Tex2DU32FloatGrad"; + case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; + case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; + case NVPTXISD::Tex2DArrayFloatFloatLevel: + return "NVPTXISD::Tex2DArrayFloatFloatLevel"; + case NVPTXISD::Tex2DArrayFloatFloatGrad: + return "NVPTXISD::Tex2DArrayFloatFloatGrad"; + case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; + case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; + case NVPTXISD::Tex2DArrayS32FloatLevel: + return "NVPTXISD::Tex2DArrayS32FloatLevel"; + case NVPTXISD::Tex2DArrayS32FloatGrad: + return "NVPTXISD::Tex2DArrayS32FloatGrad"; + case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; + case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; + case NVPTXISD::Tex2DArrayU32FloatLevel: + return "NVPTXISD::Tex2DArrayU32FloatLevel"; + case NVPTXISD::Tex2DArrayU32FloatGrad: + return "NVPTXISD::Tex2DArrayU32FloatGrad"; + case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; + case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; + case NVPTXISD::Tex3DFloatFloatLevel: + return "NVPTXISD::Tex3DFloatFloatLevel"; + case NVPTXISD::Tex3DFloatFloatGrad: + return 
"NVPTXISD::Tex3DFloatFloatGrad"; + case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; + case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; + case NVPTXISD::Tex3DS32FloatLevel: + return "NVPTXISD::Tex3DS32FloatLevel"; + case NVPTXISD::Tex3DS32FloatGrad: + return "NVPTXISD::Tex3DS32FloatGrad"; + case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; + case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; + case NVPTXISD::Tex3DU32FloatLevel: + return "NVPTXISD::Tex3DU32FloatLevel"; + case NVPTXISD::Tex3DU32FloatGrad: + return "NVPTXISD::Tex3DU32FloatGrad"; + case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; + case NVPTXISD::TexCubeFloatFloatLevel: + return "NVPTXISD::TexCubeFloatFloatLevel"; + case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; + case NVPTXISD::TexCubeS32FloatLevel: + return "NVPTXISD::TexCubeS32FloatLevel"; + case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; + case NVPTXISD::TexCubeU32FloatLevel: + return "NVPTXISD::TexCubeU32FloatLevel"; + case NVPTXISD::TexCubeArrayFloatFloat: + return "NVPTXISD::TexCubeArrayFloatFloat"; + case NVPTXISD::TexCubeArrayFloatFloatLevel: + return "NVPTXISD::TexCubeArrayFloatFloatLevel"; + case NVPTXISD::TexCubeArrayS32Float: + return "NVPTXISD::TexCubeArrayS32Float"; + case NVPTXISD::TexCubeArrayS32FloatLevel: + return "NVPTXISD::TexCubeArrayS32FloatLevel"; + case NVPTXISD::TexCubeArrayU32Float: + return "NVPTXISD::TexCubeArrayU32Float"; + case NVPTXISD::TexCubeArrayU32FloatLevel: + return "NVPTXISD::TexCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4R2DFloatFloat: + return "NVPTXISD::Tld4R2DFloatFloat"; + case NVPTXISD::Tld4G2DFloatFloat: + return "NVPTXISD::Tld4G2DFloatFloat"; + case NVPTXISD::Tld4B2DFloatFloat: + return "NVPTXISD::Tld4B2DFloatFloat"; + case NVPTXISD::Tld4A2DFloatFloat: + return "NVPTXISD::Tld4A2DFloatFloat"; + case NVPTXISD::Tld4R2DS64Float: + return "NVPTXISD::Tld4R2DS64Float"; + case NVPTXISD::Tld4G2DS64Float: + return "NVPTXISD::Tld4G2DS64Float"; + case NVPTXISD::Tld4B2DS64Float: + return "NVPTXISD::Tld4B2DS64Float"; + case NVPTXISD::Tld4A2DS64Float: + return "NVPTXISD::Tld4A2DS64Float"; + case NVPTXISD::Tld4R2DU64Float: + return "NVPTXISD::Tld4R2DU64Float"; + case NVPTXISD::Tld4G2DU64Float: + return "NVPTXISD::Tld4G2DU64Float"; + case NVPTXISD::Tld4B2DU64Float: + return "NVPTXISD::Tld4B2DU64Float"; + case NVPTXISD::Tld4A2DU64Float: + return "NVPTXISD::Tld4A2DU64Float"; + + case NVPTXISD::TexUnified1DFloatS32: + return "NVPTXISD::TexUnified1DFloatS32"; + case NVPTXISD::TexUnified1DFloatFloat: + return "NVPTXISD::TexUnified1DFloatFloat"; + case NVPTXISD::TexUnified1DFloatFloatLevel: + return "NVPTXISD::TexUnified1DFloatFloatLevel"; + case NVPTXISD::TexUnified1DFloatFloatGrad: + return "NVPTXISD::TexUnified1DFloatFloatGrad"; + case NVPTXISD::TexUnified1DS32S32: + return "NVPTXISD::TexUnified1DS32S32"; + case NVPTXISD::TexUnified1DS32Float: + return "NVPTXISD::TexUnified1DS32Float"; + case NVPTXISD::TexUnified1DS32FloatLevel: + return "NVPTXISD::TexUnified1DS32FloatLevel"; + case NVPTXISD::TexUnified1DS32FloatGrad: + return "NVPTXISD::TexUnified1DS32FloatGrad"; + case NVPTXISD::TexUnified1DU32S32: + return "NVPTXISD::TexUnified1DU32S32"; + case NVPTXISD::TexUnified1DU32Float: + return "NVPTXISD::TexUnified1DU32Float"; + case NVPTXISD::TexUnified1DU32FloatLevel: + return "NVPTXISD::TexUnified1DU32FloatLevel"; + case NVPTXISD::TexUnified1DU32FloatGrad: + return "NVPTXISD::TexUnified1DU32FloatGrad"; + case 
NVPTXISD::TexUnified1DArrayFloatS32: + return "NVPTXISD::TexUnified1DArrayFloatS32"; + case NVPTXISD::TexUnified1DArrayFloatFloat: + return "NVPTXISD::TexUnified1DArrayFloatFloat"; + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified1DArrayS32S32: + return "NVPTXISD::TexUnified1DArrayS32S32"; + case NVPTXISD::TexUnified1DArrayS32Float: + return "NVPTXISD::TexUnified1DArrayS32Float"; + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; + case NVPTXISD::TexUnified1DArrayU32S32: + return "NVPTXISD::TexUnified1DArrayU32S32"; + case NVPTXISD::TexUnified1DArrayU32Float: + return "NVPTXISD::TexUnified1DArrayU32Float"; + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; + case NVPTXISD::TexUnified2DFloatS32: + return "NVPTXISD::TexUnified2DFloatS32"; + case NVPTXISD::TexUnified2DFloatFloat: + return "NVPTXISD::TexUnified2DFloatFloat"; + case NVPTXISD::TexUnified2DFloatFloatLevel: + return "NVPTXISD::TexUnified2DFloatFloatLevel"; + case NVPTXISD::TexUnified2DFloatFloatGrad: + return "NVPTXISD::TexUnified2DFloatFloatGrad"; + case NVPTXISD::TexUnified2DS32S32: + return "NVPTXISD::TexUnified2DS32S32"; + case NVPTXISD::TexUnified2DS32Float: + return "NVPTXISD::TexUnified2DS32Float"; + case NVPTXISD::TexUnified2DS32FloatLevel: + return "NVPTXISD::TexUnified2DS32FloatLevel"; + case NVPTXISD::TexUnified2DS32FloatGrad: + return "NVPTXISD::TexUnified2DS32FloatGrad"; + case NVPTXISD::TexUnified2DU32S32: + return "NVPTXISD::TexUnified2DU32S32"; + case NVPTXISD::TexUnified2DU32Float: + return "NVPTXISD::TexUnified2DU32Float"; + case NVPTXISD::TexUnified2DU32FloatLevel: + return "NVPTXISD::TexUnified2DU32FloatLevel"; + case NVPTXISD::TexUnified2DU32FloatGrad: + return "NVPTXISD::TexUnified2DU32FloatGrad"; + case NVPTXISD::TexUnified2DArrayFloatS32: + return "NVPTXISD::TexUnified2DArrayFloatS32"; + case NVPTXISD::TexUnified2DArrayFloatFloat: + return "NVPTXISD::TexUnified2DArrayFloatFloat"; + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified2DArrayS32S32: + return "NVPTXISD::TexUnified2DArrayS32S32"; + case NVPTXISD::TexUnified2DArrayS32Float: + return "NVPTXISD::TexUnified2DArrayS32Float"; + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; + case NVPTXISD::TexUnified2DArrayU32S32: + return "NVPTXISD::TexUnified2DArrayU32S32"; + case NVPTXISD::TexUnified2DArrayU32Float: + return "NVPTXISD::TexUnified2DArrayU32Float"; + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; + case NVPTXISD::TexUnified3DFloatS32: + return "NVPTXISD::TexUnified3DFloatS32"; + case NVPTXISD::TexUnified3DFloatFloat: + return "NVPTXISD::TexUnified3DFloatFloat"; + case 
NVPTXISD::TexUnified3DFloatFloatLevel: + return "NVPTXISD::TexUnified3DFloatFloatLevel"; + case NVPTXISD::TexUnified3DFloatFloatGrad: + return "NVPTXISD::TexUnified3DFloatFloatGrad"; + case NVPTXISD::TexUnified3DS32S32: + return "NVPTXISD::TexUnified3DS32S32"; + case NVPTXISD::TexUnified3DS32Float: + return "NVPTXISD::TexUnified3DS32Float"; + case NVPTXISD::TexUnified3DS32FloatLevel: + return "NVPTXISD::TexUnified3DS32FloatLevel"; + case NVPTXISD::TexUnified3DS32FloatGrad: + return "NVPTXISD::TexUnified3DS32FloatGrad"; + case NVPTXISD::TexUnified3DU32S32: + return "NVPTXISD::TexUnified3DU32S32"; + case NVPTXISD::TexUnified3DU32Float: + return "NVPTXISD::TexUnified3DU32Float"; + case NVPTXISD::TexUnified3DU32FloatLevel: + return "NVPTXISD::TexUnified3DU32FloatLevel"; + case NVPTXISD::TexUnified3DU32FloatGrad: + return "NVPTXISD::TexUnified3DU32FloatGrad"; + case NVPTXISD::TexUnifiedCubeFloatFloat: + return "NVPTXISD::TexUnifiedCubeFloatFloat"; + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeS32Float: + return "NVPTXISD::TexUnifiedCubeS32Float"; + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeU32Float: + return "NVPTXISD::TexUnifiedCubeU32Float"; + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayS32Float: + return "NVPTXISD::TexUnifiedCubeArrayS32Float"; + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayU32Float: + return "NVPTXISD::TexUnifiedCubeArrayU32Float"; + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; + case NVPTXISD::Tld4UnifiedR2DS64Float: + return "NVPTXISD::Tld4UnifiedR2DS64Float"; + case NVPTXISD::Tld4UnifiedG2DS64Float: + return "NVPTXISD::Tld4UnifiedG2DS64Float"; + case NVPTXISD::Tld4UnifiedB2DS64Float: + return "NVPTXISD::Tld4UnifiedB2DS64Float"; + case NVPTXISD::Tld4UnifiedA2DS64Float: + return "NVPTXISD::Tld4UnifiedA2DS64Float"; + case NVPTXISD::Tld4UnifiedR2DU64Float: + return "NVPTXISD::Tld4UnifiedR2DU64Float"; + case NVPTXISD::Tld4UnifiedG2DU64Float: + return "NVPTXISD::Tld4UnifiedG2DU64Float"; + case NVPTXISD::Tld4UnifiedB2DU64Float: + return "NVPTXISD::Tld4UnifiedB2DU64Float"; + case NVPTXISD::Tld4UnifiedA2DU64Float: + return "NVPTXISD::Tld4UnifiedA2DU64Float"; + + case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; + case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; + case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; + case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; + case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; + case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; + case NVPTXISD::Suld1DV2I32Clamp: return 
"NVPTXISD::Suld1DV2I32Clamp"; + case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; + case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; + case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; + case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; + + case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; + case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; + case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; + case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; + case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; + case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; + case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; + case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; + case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; + case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; + case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; + + case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; + case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; + case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; + case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; + case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; + case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; + case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; + case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; + case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; + case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; + case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; + + case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; + case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; + case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; + case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; + case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; + case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; + case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; + case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; + case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; + case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; + case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; + + case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; + case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; + case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; + case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; + case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; + case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; + case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; + case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; + case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; + case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; + case 
NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; + + case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; + case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; + case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; + case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; + case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; + case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; + case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; + case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; + case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; + case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; + case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; + + case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; + case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; + case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; + case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; + case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; + case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; + case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; + case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; + case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; + case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; + case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; + + case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; + case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; + case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; + case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; + case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; + case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; + case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; + case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; + case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; + case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; + case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; + + case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; + case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; + case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; + case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; + case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; + case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; + case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; + case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; + case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; + case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; + case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; + + case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; + case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; + case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; + case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; + 
case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; + case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; + case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; + case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; + case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; + case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; + case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; + + case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; + case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; + case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; + case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; + case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; + case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; + case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; + case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; + case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; + case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; + case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; + + case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; + case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; + case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; + case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; + case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; + case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; + case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; + case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; + case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; + case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; + case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; + + case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; + case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; + case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; + case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; + case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; + case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; + case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; + case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; + case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; + case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; + case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; + + case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; + case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; + case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; + case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; + case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; + case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; + case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; + case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; + case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; + case 
NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; + case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; + + case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; + case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; + case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; + case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; + case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; + case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; + case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; + case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; + case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; + case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; + case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; } } -bool NVPTXTargetLowering::shouldSplitVectorElementType(EVT VT) const { - return VT == MVT::i1; +TargetLoweringBase::LegalizeTypeAction +NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); } SDValue @@ -361,7 +890,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, O << "()"; } else { O << "("; - if (retTy->isPrimitiveType() || retTy->isIntegerTy()) { + if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { unsigned size = 0; if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { size = ITy->getBitWidth(); @@ -377,26 +906,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, } else if (isa<PointerType>(retTy)) { O << ".param .b" << getPointerTy().getSizeInBits() << " _"; } else { - if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) { - SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, retTy, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = vtparts[i].getVectorElementType(); - } - // TODO: no need to loop - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } - O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]"; + if((retTy->getTypeID() == Type::StructTyID) || + isa<VectorType>(retTy)) { + O << ".param .align " + << retAlignment + << " .b8 _[" + << getDataLayout()->getTypeAllocSize(retTy) << "]"; } else { assert(false && "Unknown return type"); } @@ -526,7 +1041,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; - ArgListTy &Args = CLI.Args; + ArgListTy &Args = CLI.getArgs(); Type *retTy = CLI.RetTy; ImmutableCallSite *CS = CLI.CS; @@ -565,7 +1080,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Ty->isAggregateType()) { // aggregate SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, Ty, vtparts); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; @@ -575,36 +1091,28 @@ SDValue 
NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); - } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue StVal = OutVals[OIdx]; - if (elemtype.getSizeInBits() < 16) { - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), - StVal, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, &CopyParamOps[0], 5, - elemtype, MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - ++OIdx; + unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]); + if (elemtype.isInteger() && (sz < 8)) + sz = 8; + SDValue StVal = OutVals[OIdx]; + if (elemtype.getSizeInBits() < 16) { + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(Offsets[j], MVT::i32), + StVal, InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, + CopyParamVTs, CopyParamOps, + elemtype, MachinePointerInfo(), + ArgAlign); + InFlag = Chain.getValue(1); + ++OIdx; } if (vtparts.size() > 0) --OIdx; @@ -621,7 +1129,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); unsigned NumElts = ObjectVT.getVectorNumElements(); EVT EltVT = ObjectVT.getVectorElementType(); @@ -644,7 +1152,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(0, MVT::i32), Elt, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, &CopyParamOps[0], 5, + CopyParamVTs, CopyParamOps, MemVT, MachinePointerInfo()); InFlag = Chain.getValue(1); } else if (NumElts == 2) { @@ -661,7 +1169,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(0, MVT::i32), Elt0, Elt1, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl, - CopyParamVTs, &CopyParamOps[0], 6, + CopyParamVTs, CopyParamOps, MemVT, MachinePointerInfo()); InFlag = Chain.getValue(1); } else { @@ -735,9 +1243,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(InFlag); SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, &Ops[0], - Ops.size(), MemVT, - MachinePointerInfo()); + Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops, + MemVT, MachinePointerInfo()); InFlag = Chain.getValue(1); curOffset += PerStoreOffset; } @@ -762,7 +1269,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 
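[Editorial note, not part of the diff] The LowerCall hunks around this point stop using a running byte counter and instead take per-element offsets from ComputePTXValueVTs, deriving each element's store/load alignment as GreatestCommonDivisor64(align, Offsets[j]). A minimal standalone sketch of that rule follows; the helper name elementAlign and the concrete numbers are illustrative only.

#include <cstdint>
#include <cstdio>
#include <numeric>

// Guaranteed alignment of a piece that sits 'offset' bytes into an object
// whose start is 'base'-byte aligned: gcd(base, offset), with gcd(base, 0)
// == base covering the first element.
static uint64_t elementAlign(uint64_t base, uint64_t offset) {
  return std::gcd(base, offset);
}

int main() {
  // An aggregate passed as ".param .align 8 .b8 _[16]", split into 4-byte pieces:
  const unsigned offsets[] = {0, 4, 8, 12};
  for (unsigned off : offsets)
    std::printf("offset %2u -> align %llu\n", off,
                static_cast<unsigned long long>(elementAlign(8, off)));
  // Prints 8, 4, 8, 4: the pieces at offsets 4 and 12 can only be assumed to
  // be 4-byte aligned, so their st.param/ld.param must use that alignment.
  return 0;
}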
DAG.getConstant(sz, MVT::i32), DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); SDValue OutV = OutVals[OIdx]; if (needExtend) { @@ -781,7 +1288,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, opcode = NVPTXISD::StoreParamU32; else if (Outs[OIdx].Flags.isSExt()) opcode = NVPTXISD::StoreParamS32; - Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, 5, + Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, VT, MachinePointerInfo()); InFlag = Chain.getValue(1); @@ -790,13 +1297,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // struct or vector SmallVector<EVT, 16> vtparts; + SmallVector<uint64_t, 16> Offsets; const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputeValueVTs(*this, PTy->getElementType(), vtparts); + ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0); // declare .param .align <align> .b8 .param<n>[<size>]; unsigned sz = Outs[OIdx].Flags.getByValSize(); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, // so we don't need to worry about natural alignment or not. // See TargetLowering::LowerCallTo(). @@ -806,40 +1315,30 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); + int curOffset = Offsets[j]; + unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); + SDValue srcAddr = + DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], + DAG.getConstant(curOffset, getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, + PartAlign); + if (elemtype.getSizeInBits() < 16) { + theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], - DAG.getConstant(curOffset, getPointerTy())); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), false, false, false, - 0); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), theVal, - InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, 5, elemtype, - MachinePointerInfo()); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(curOffset, MVT::i32), theVal, + InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, 
+ CopyParamOps, elemtype, + MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - } + InFlag = Chain.getValue(1); } ++paramCount; } @@ -856,8 +1355,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // .param .align 16 .b8 retval0[<size-in-bytes>], or // .param .b<size-in-bits> retval0 unsigned resultsz = TD->getTypeAllocSizeInBits(retTy); - if (retTy->isPrimitiveType() || retTy->isIntegerTy() || - retTy->isPointerTy()) { + if (retTy->isSingleValueType()) { // Scalar needs to be at least 32bit wide if (resultsz < 32) resultsz = 32; @@ -866,7 +1364,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(resultsz, MVT::i32), DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, - DeclareRetOps, 5); + DeclareRetOps); InFlag = Chain.getValue(1); } else { retAlignment = getArgumentAlignment(Callee, CS, retTy, 0); @@ -876,7 +1374,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(resultsz / 8, MVT::i32), DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, - DeclareRetOps, 5); + DeclareRetOps); InFlag = Chain.getValue(1); } } @@ -896,7 +1394,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue ProtoOps[] = { Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, }; - Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, &ProtoOps[0], 3); + Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); InFlag = Chain.getValue(1); } // Op to just print "call" @@ -905,20 +1403,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag }; Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall), - dl, PrintCallVTs, PrintCallOps, 3); + dl, PrintCallVTs, PrintCallOps); InFlag = Chain.getValue(1); // Ops to print out the function name SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallVoidOps[] = { Chain, Callee, InFlag }; - Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3); + Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); InFlag = Chain.getValue(1); // Ops to print out the param list SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgBeginOps[] = { Chain, InFlag }; Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, - CallArgBeginOps, 2); + CallArgBeginOps); InFlag = Chain.getValue(1); for (unsigned i = 0, e = paramCount; i != e; ++i) { @@ -930,27 +1428,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32), DAG.getConstant(i, MVT::i32), InFlag }; - Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4); + Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); InFlag = Chain.getValue(1); } SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 
1 : 0, MVT::i32), InFlag }; - Chain = - DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps, 3); + Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); InFlag = Chain.getValue(1); if (!Func) { SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3); + Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); InFlag = Chain.getValue(1); } // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - unsigned resoffset = 0; if (retTy && retTy->isVectorTy()) { EVT ObjectVT = getValueType(retTy); unsigned NumElts = ObjectVT.getVectorNumElements(); @@ -959,29 +1455,29 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ObjectVT) == NumElts && "Vector was not scalarized"); unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 16 ? true : false; + bool needTruncate = sz < 8 ? true : false; if (NumElts == 1) { // Just a simple load - std::vector<EVT> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + SmallVector<EVT, 4> LoadRetVTs; + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 LoadRetVTs.push_back(MVT::i16); } else LoadRetVTs.push_back(EltVT); LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue); - std::vector<SDValue> LoadRetOps; + SmallVector<SDValue, 4> LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParam, dl, - DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], - LoadRetOps.size(), EltVT, MachinePointerInfo()); + DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); Chain = retval.getValue(1); InFlag = retval.getValue(2); SDValue Ret0 = retval; @@ -990,10 +1486,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret0); } else if (NumElts == 2) { // LoadV2 - std::vector<EVT> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + SmallVector<EVT, 4> LoadRetVTs; + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 LoadRetVTs.push_back(MVT::i16); LoadRetVTs.push_back(MVT::i16); @@ -1003,15 +1500,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue); - std::vector<SDValue> LoadRetOps; + SmallVector<SDValue, 4> LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParamV2, dl, - DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], - LoadRetOps.size(), EltVT, MachinePointerInfo()); + DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); Chain = retval.getValue(2); InFlag = retval.getValue(3); SDValue Ret0 = retval.getValue(0); @@ -1037,9 +1533,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); for (unsigned i = 0; i < 
NumElts; i += VecSize) { SmallVector<EVT, 8> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 for (unsigned j = 0; j < VecSize; ++j) LoadRetVTs.push_back(MVT::i16); @@ -1055,8 +1552,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( - Opc, dl, DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), - &LoadRetOps[0], LoadRetOps.size(), EltVT, MachinePointerInfo()); + Opc, dl, DAG.getVTList(LoadRetVTs), + LoadRetOps, EltVT, MachinePointerInfo()); if (VecSize == 2) { Chain = retval.getValue(2); InFlag = retval.getValue(3); @@ -1078,10 +1575,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else { SmallVector<EVT, 16> VTs; - ComputePTXValueVTs(*this, retTy, VTs); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); + unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { unsigned sz = VTs[i].getSizeInBits(); + unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); bool needTruncate = sz < 8 ? true : false; if (VTs[i].isInteger() && (sz < 8)) sz = 8; @@ -1107,19 +1607,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 4> LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); - LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParam, dl, - DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], - LoadRetOps.size(), TheLoadType, MachinePointerInfo()); + DAG.getVTList(LoadRetVTs), LoadRetOps, + TheLoadType, MachinePointerInfo(), AlignI); Chain = retval.getValue(1); InFlag = retval.getValue(2); SDValue Ret0 = retval.getValue(0); if (needTruncate) Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); InVals.push_back(Ret0); - resoffset += sz / 8; } } } @@ -1154,8 +1653,128 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(j))); } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), &Ops[0], - Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops); +} + +/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL;
+
+  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+    // {dHi, dLo} = {aHi, aLo} >> Amt
+    //   dHi = aHi >> Amt
+    //   dLo = shf.r.clamp aLo, aHi, Amt
+
+    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+
+    // {dHi, dLo} = {aHi, aLo} >> Amt
+    // - if (Amt>=size) then
+    //      dLo = aHi >> (Amt-size)
+    //      dHi = aHi >> Amt (this is either all 0 or all 1)
+    //   else
+    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+    //      dHi = aHi >> Amt
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+
+  if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
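[Editorial note, not part of the diff] The non-sm_35 path of LowerShiftRightParts above is the standard two-register expansion of a double-width logical shift right (e.g. a 64-bit value held as two 32-bit halves). A standalone C++ check of that expansion; the function name is illustrative, and the amt == 0 case is split out so the (bits - amt) shift stays well defined in C++.

#include <cstdint>
#include <cstdio>

static void srlParts(uint32_t lo, uint32_t hi, unsigned amt,
                     uint32_t &dLo, uint32_t &dHi) {
  const unsigned bits = 32;
  if (amt >= bits) {            // TrueVal path: low half comes entirely from hi
    dLo = hi >> (amt - bits);
    dHi = 0;                    // logical shift: high half becomes all zeros
  } else if (amt == 0) {
    dLo = lo;
    dHi = hi;
  } else {                      // FalseVal path: (lo >> amt) | (hi << (bits - amt))
    dLo = (lo >> amt) | (hi << (bits - amt));
    dHi = hi >> amt;
  }
}

int main() {
  const uint64_t v = 0x123456789abcdef0ULL;
  for (unsigned amt = 0; amt < 64; ++amt) {
    uint32_t lo, hi;
    srlParts(static_cast<uint32_t>(v), static_cast<uint32_t>(v >> 32), amt, lo, hi);
    const uint64_t got = (static_cast<uint64_t>(hi) << 32) | lo;
    if (got != (v >> amt))
      std::printf("mismatch at amt=%u\n", amt);
  }
  // On sm_35 and newer the same low half is produced by a single
  // shf.r.clamp, which is what the FUN_SHFR_CLAMP node above selects.
  return 0;
}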
+ // {dHi, dLo} = {aHi, aLo} << Amt + // dHi = shf.l.clamp aLo, aHi, Amt + // dLo = aLo << Amt + + SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + + // {dHi, dLo} = {aHi, aLo} << Amt + // - if (Amt>=size) then + // dLo = aLo << Amt (all 0) + // dLo = aLo << (Amt-size) + // else + // dLo = aLo << Amt + // dHi = (aHi << Amt) | (aLo >> (size-Amt)) + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, MVT::i32), ISD::SETGE); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } } SDValue @@ -1178,6 +1797,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: + return LowerShiftRightParts(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); } @@ -1210,7 +1834,7 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() // in LegalizeDAG.cpp which also uses MergeValues. SDValue Ops[] = { result, LD->getChain() }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1253,13 +1877,28 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { break; } + MemSDNode *MemSD = cast<MemSDNode>(N); + const DataLayout *TD = getDataLayout(); + + unsigned Align = MemSD->getAlignment(); + unsigned PrefAlign = + TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); + if (Align < PrefAlign) { + // This store is not sufficiently aligned, so bail out and let this vector + // store be scalarized. Note that we may still be able to emit smaller + // vector stores. For example, if we are storing a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. + return SDValue(); + } + unsigned Opcode = 0; EVT EltVT = ValVT.getVectorElementType(); unsigned NumElts = ValVT.getVectorNumElements(); // Since StoreV2 is a target node, we cannot rely on DAG type legalization. // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // stored type to i16 and propogate the "real" type as the memory type. + // stored type to i16 and propagate the "real" type as the memory type. 
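[Editorial note, not part of the diff] The Align < PrefAlign bail-out added to LowerSTOREVector above leaves under-aligned vector stores to the legalizer, which retries with narrower pieces. A rough standalone sketch of the resulting chunking, assuming the preferred alignment of a vector is simply its byte size; the helper name is illustrative only.

#include <cstdio>

// Widest element count whose vector byte size still fits the store's known
// alignment (halving until it does).
static unsigned chunkElems(unsigned numElems, unsigned elemBytes, unsigned align) {
  unsigned n = numElems;
  while (n > 1 && n * elemBytes > align)
    n /= 2;
  return n;
}

int main() {
  // <4 x float> (16 bytes) stored with only 8-byte alignment: the v4 store is
  // rejected here, but two <2 x float> stores (8 bytes each) go through.
  std::printf("%u-element chunks\n", chunkElems(4, 4, 8));   // prints 2
  // With 16-byte alignment the full v4 store would have been kept.
  std::printf("%u-element chunks\n", chunkElems(4, 4, 16));  // prints 4
  return 0;
}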
bool NeedExt = false; if (EltVT.getSizeInBits() < 16) NeedExt = true; @@ -1295,10 +1934,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { Ops.push_back(N->getOperand(i)); } - MemSDNode *MemSD = cast<MemSDNode>(N); - SDValue NewSt = DAG.getMemIntrinsicNode( - Opcode, DL, DAG.getVTList(MVT::Other), &Ops[0], Ops.size(), + Opcode, DL, DAG.getVTList(MVT::Other), Ops, MemSD->getMemoryVT(), MemSD->getMemOperand()); //return DCI.CombineTo(N, NewSt, true); @@ -1391,7 +2028,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const Function *F = MF.getFunction(); const AttributeSet &PAL = F->getAttributes(); - const TargetLowering *TLI = nvTM->getTargetLowering(); + const TargetLowering *TLI = DAG.getTarget().getTargetLowering(); SDValue Root = DAG.getRoot(); std::vector<SDValue> OutChains; @@ -1430,7 +2067,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (isImageOrSamplerVal( theArgs[i], (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() - : 0))) { + : nullptr))) { assert(isKernel && "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, MVT::i32)); continue; @@ -1445,8 +2082,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( assert(vtparts.size() > 0 && "empty aggregate type not expected"); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { - EVT partVT = vtparts[parti]; - InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT)); + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); ++InsIdx; } if (vtparts.size() > 0) @@ -1684,8 +2320,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( //} if (!OutChains.empty()) - DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &OutChains[0], - OutChains.size())); + DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); return Chain; } @@ -1727,7 +2362,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), &Ops[0], 3, + DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); } else if (NumElts == 2) { @@ -1743,7 +2378,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0, StoreVal1 }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl, - DAG.getVTList(MVT::Other), &Ops[0], 4, + DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); } else { // V4 stores @@ -1763,7 +2398,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned Offset = 0; EVT VecVT = - EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize); + EVT::getVectorVT(F->getContext(), EltVT, VecSize); unsigned PerStoreOffset = TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); @@ -1815,19 +2450,17 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size()); Chain = - DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), &Ops[0], - Ops.size(), EltVT, MachinePointerInfo()); + DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops, + EltVT, MachinePointerInfo()); Offset += PerStoreOffset; } } } else { SmallVector<EVT, 16> ValVTs; - // const_cast is necessary since we are still using an LLVM version from - // before the type system re-write. 
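[Editorial note, not part of the diff] The LowerReturn hunk below makes the same change as the call-lowering hunks earlier: the running byte counter (SizeSoFar += sz / 8) is replaced by per-element offsets from ComputePTXValueVTs, so padded aggregates are stored at their real layout offsets. A plain C++ illustration of the difference, assuming a typical ABI where int is 4-byte aligned; the struct name is illustrative only.

#include <cstddef>
#include <cstdio>

struct Example { char a; int b; };   // flattened as i8 at offset 0, i32 at offset 4 (after padding)

int main() {
  std::printf("running-size offset of b: %zu\n", sizeof(char));           // 1
  std::printf("layout offset of b:       %zu\n", offsetof(Example, b));   // typically 4
  std::printf("alloc size of Example:    %zu\n", sizeof(Example));        // typically 8
  // The removed code summed element store sizes (giving offset 1 and total 5
  // here); the new code uses the data-layout offsets and getTypeAllocSize.
  return 0;
}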
- ComputePTXValueVTs(*this, RetTy, ValVTs); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0); assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); - unsigned SizeSoFar = 0; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { SDValue theVal = OutVals[i]; EVT TheValType = theVal.getValueType(); @@ -1851,16 +2484,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, else if (TmpVal.getValueType().getSizeInBits() < 16) TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal); - SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal }; + SDValue Ops[] = { + Chain, + DAG.getConstant(Offsets[i], MVT::i32), + TmpVal }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), &Ops[0], - 3, TheStoreType, + DAG.getVTList(MVT::Other), Ops, + TheStoreType, MachinePointerInfo()); - if(TheValType.isVector()) - SizeSoFar += - TheStoreType.getVectorElementType().getStoreSizeInBits() / 8; - else - SizeSoFar += TheStoreType.getStoreSizeInBits()/8; } } } @@ -1892,6 +2523,702 @@ bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { return false; } +static unsigned getOpcForTextureInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + return 0; + + case Intrinsic::nvvm_tex_1d_v4f32_s32: + return NVPTXISD::Tex1DFloatS32; + case Intrinsic::nvvm_tex_1d_v4f32_f32: + return NVPTXISD::Tex1DFloatFloat; + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_v4s32_s32: + return NVPTXISD::Tex1DS32S32; + case Intrinsic::nvvm_tex_1d_v4s32_f32: + return NVPTXISD::Tex1DS32Float; + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + return NVPTXISD::Tex1DS32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + return NVPTXISD::Tex1DS32FloatGrad; + case Intrinsic::nvvm_tex_1d_v4u32_s32: + return NVPTXISD::Tex1DU32S32; + case Intrinsic::nvvm_tex_1d_v4u32_f32: + return NVPTXISD::Tex1DU32Float; + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + return NVPTXISD::Tex1DU32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + return NVPTXISD::Tex1DU32FloatGrad; + + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + return NVPTXISD::Tex1DArrayFloatS32; + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloat; + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + return NVPTXISD::Tex1DArrayS32S32; + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + return NVPTXISD::Tex1DArrayS32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + return NVPTXISD::Tex1DArrayU32S32; + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + return NVPTXISD::Tex1DArrayU32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_v4f32_s32: + return NVPTXISD::Tex2DFloatS32; + case Intrinsic::nvvm_tex_2d_v4f32_f32: + return NVPTXISD::Tex2DFloatFloat; + case 
Intrinsic::nvvm_tex_2d_level_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_v4s32_s32: + return NVPTXISD::Tex2DS32S32; + case Intrinsic::nvvm_tex_2d_v4s32_f32: + return NVPTXISD::Tex2DS32Float; + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + return NVPTXISD::Tex2DS32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + return NVPTXISD::Tex2DS32FloatGrad; + case Intrinsic::nvvm_tex_2d_v4u32_s32: + return NVPTXISD::Tex2DU32S32; + case Intrinsic::nvvm_tex_2d_v4u32_f32: + return NVPTXISD::Tex2DU32Float; + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + return NVPTXISD::Tex2DU32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + return NVPTXISD::Tex2DU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: + return NVPTXISD::Tex2DArrayFloatS32; + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloat; + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + return NVPTXISD::Tex2DArrayS32S32; + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + return NVPTXISD::Tex2DArrayS32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + return NVPTXISD::Tex2DArrayU32S32; + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + return NVPTXISD::Tex2DArrayU32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_3d_v4f32_s32: + return NVPTXISD::Tex3DFloatS32; + case Intrinsic::nvvm_tex_3d_v4f32_f32: + return NVPTXISD::Tex3DFloatFloat; + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatGrad; + case Intrinsic::nvvm_tex_3d_v4s32_s32: + return NVPTXISD::Tex3DS32S32; + case Intrinsic::nvvm_tex_3d_v4s32_f32: + return NVPTXISD::Tex3DS32Float; + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + return NVPTXISD::Tex3DS32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + return NVPTXISD::Tex3DS32FloatGrad; + case Intrinsic::nvvm_tex_3d_v4u32_s32: + return NVPTXISD::Tex3DU32S32; + case Intrinsic::nvvm_tex_3d_v4u32_f32: + return NVPTXISD::Tex3DU32Float; + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + return NVPTXISD::Tex3DU32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + return NVPTXISD::Tex3DU32FloatGrad; + + case Intrinsic::nvvm_tex_cube_v4f32_f32: + return NVPTXISD::TexCubeFloatFloat; + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + return NVPTXISD::TexCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_v4s32_f32: + return NVPTXISD::TexCubeS32Float; + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + return NVPTXISD::TexCubeS32FloatLevel; + case Intrinsic::nvvm_tex_cube_v4u32_f32: + return NVPTXISD::TexCubeU32Float; + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + return NVPTXISD::TexCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + return 
NVPTXISD::TexCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + return NVPTXISD::TexCubeArrayS32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + return NVPTXISD::TexCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + return NVPTXISD::TexCubeArrayU32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + return NVPTXISD::TexCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + return NVPTXISD::Tld4R2DFloatFloat; + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + return NVPTXISD::Tld4G2DFloatFloat; + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + return NVPTXISD::Tld4B2DFloatFloat; + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + return NVPTXISD::Tld4A2DFloatFloat; + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + return NVPTXISD::Tld4R2DS64Float; + case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + return NVPTXISD::Tld4G2DS64Float; + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + return NVPTXISD::Tld4B2DS64Float; + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + return NVPTXISD::Tld4A2DS64Float; + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + return NVPTXISD::Tld4R2DU64Float; + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + return NVPTXISD::Tld4G2DU64Float; + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + return NVPTXISD::Tld4B2DU64Float; + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + return NVPTXISD::Tld4A2DU64Float; + + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + return NVPTXISD::TexUnified1DFloatS32; + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + return NVPTXISD::TexUnified1DS32S32; + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + return NVPTXISD::TexUnified1DS32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + return NVPTXISD::TexUnified1DU32S32; + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + return NVPTXISD::TexUnified1DU32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + return NVPTXISD::TexUnified1DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + return NVPTXISD::TexUnified1DArrayS32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + return NVPTXISD::TexUnified1DArrayU32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + return 
NVPTXISD::TexUnified1DArrayU32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + return NVPTXISD::TexUnified2DFloatS32; + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + return NVPTXISD::TexUnified2DS32S32; + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + return NVPTXISD::TexUnified2DS32Float; + case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + return NVPTXISD::TexUnified2DU32S32; + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + return NVPTXISD::TexUnified2DU32Float; + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + return NVPTXISD::TexUnified2DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + return NVPTXISD::TexUnified2DArrayS32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + return NVPTXISD::TexUnified2DArrayU32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + return NVPTXISD::TexUnified3DFloatS32; + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloat; + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + return NVPTXISD::TexUnified3DS32S32; + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + return NVPTXISD::TexUnified3DS32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + return NVPTXISD::TexUnified3DU32S32; + case 
Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + return NVPTXISD::TexUnified3DU32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedR2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedG2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedB2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedA2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedR2DS64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedG2DS64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedB2DS64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedA2DS64Float; + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedR2DU64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedG2DU64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedB2DU64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedA2DU64Float; + } +} + +static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + return 0; + case Intrinsic::nvvm_suld_1d_i8_clamp: + return NVPTXISD::Suld1DI8Clamp; + case Intrinsic::nvvm_suld_1d_i16_clamp: + return NVPTXISD::Suld1DI16Clamp; + case Intrinsic::nvvm_suld_1d_i32_clamp: + return NVPTXISD::Suld1DI32Clamp; + case Intrinsic::nvvm_suld_1d_i64_clamp: + return NVPTXISD::Suld1DI64Clamp; + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + return NVPTXISD::Suld1DV2I8Clamp; + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + return NVPTXISD::Suld1DV2I16Clamp; + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + return NVPTXISD::Suld1DV2I32Clamp; + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + return NVPTXISD::Suld1DV2I64Clamp; + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + return NVPTXISD::Suld1DV4I8Clamp; + case 
Intrinsic::nvvm_suld_1d_v4i16_clamp: + return NVPTXISD::Suld1DV4I16Clamp; + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + return NVPTXISD::Suld1DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + return NVPTXISD::Suld1DArrayI8Clamp; + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + return NVPTXISD::Suld1DArrayI16Clamp; + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + return NVPTXISD::Suld1DArrayI32Clamp; + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + return NVPTXISD::Suld1DArrayI64Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + return NVPTXISD::Suld1DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + return NVPTXISD::Suld1DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + return NVPTXISD::Suld1DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + return NVPTXISD::Suld1DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + return NVPTXISD::Suld1DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + return NVPTXISD::Suld1DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + return NVPTXISD::Suld1DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_2d_i8_clamp: + return NVPTXISD::Suld2DI8Clamp; + case Intrinsic::nvvm_suld_2d_i16_clamp: + return NVPTXISD::Suld2DI16Clamp; + case Intrinsic::nvvm_suld_2d_i32_clamp: + return NVPTXISD::Suld2DI32Clamp; + case Intrinsic::nvvm_suld_2d_i64_clamp: + return NVPTXISD::Suld2DI64Clamp; + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + return NVPTXISD::Suld2DV2I8Clamp; + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + return NVPTXISD::Suld2DV2I16Clamp; + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + return NVPTXISD::Suld2DV2I32Clamp; + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + return NVPTXISD::Suld2DV2I64Clamp; + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + return NVPTXISD::Suld2DV4I8Clamp; + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + return NVPTXISD::Suld2DV4I16Clamp; + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + return NVPTXISD::Suld2DV4I32Clamp; + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + return NVPTXISD::Suld2DArrayI8Clamp; + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + return NVPTXISD::Suld2DArrayI16Clamp; + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + return NVPTXISD::Suld2DArrayI32Clamp; + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + return NVPTXISD::Suld2DArrayI64Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + return NVPTXISD::Suld2DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + return NVPTXISD::Suld2DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + return NVPTXISD::Suld2DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + return NVPTXISD::Suld2DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + return NVPTXISD::Suld2DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + return NVPTXISD::Suld2DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + return NVPTXISD::Suld2DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_3d_i8_clamp: + return NVPTXISD::Suld3DI8Clamp; + case Intrinsic::nvvm_suld_3d_i16_clamp: + return NVPTXISD::Suld3DI16Clamp; + case Intrinsic::nvvm_suld_3d_i32_clamp: + return NVPTXISD::Suld3DI32Clamp; + case Intrinsic::nvvm_suld_3d_i64_clamp: + return NVPTXISD::Suld3DI64Clamp; + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + return NVPTXISD::Suld3DV2I8Clamp; + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + return NVPTXISD::Suld3DV2I16Clamp; + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + return NVPTXISD::Suld3DV2I32Clamp; + 
case Intrinsic::nvvm_suld_3d_v2i64_clamp: + return NVPTXISD::Suld3DV2I64Clamp; + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + return NVPTXISD::Suld3DV4I8Clamp; + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + return NVPTXISD::Suld3DV4I16Clamp; + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + return NVPTXISD::Suld3DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_i8_trap: + return NVPTXISD::Suld1DI8Trap; + case Intrinsic::nvvm_suld_1d_i16_trap: + return NVPTXISD::Suld1DI16Trap; + case Intrinsic::nvvm_suld_1d_i32_trap: + return NVPTXISD::Suld1DI32Trap; + case Intrinsic::nvvm_suld_1d_i64_trap: + return NVPTXISD::Suld1DI64Trap; + case Intrinsic::nvvm_suld_1d_v2i8_trap: + return NVPTXISD::Suld1DV2I8Trap; + case Intrinsic::nvvm_suld_1d_v2i16_trap: + return NVPTXISD::Suld1DV2I16Trap; + case Intrinsic::nvvm_suld_1d_v2i32_trap: + return NVPTXISD::Suld1DV2I32Trap; + case Intrinsic::nvvm_suld_1d_v2i64_trap: + return NVPTXISD::Suld1DV2I64Trap; + case Intrinsic::nvvm_suld_1d_v4i8_trap: + return NVPTXISD::Suld1DV4I8Trap; + case Intrinsic::nvvm_suld_1d_v4i16_trap: + return NVPTXISD::Suld1DV4I16Trap; + case Intrinsic::nvvm_suld_1d_v4i32_trap: + return NVPTXISD::Suld1DV4I32Trap; + case Intrinsic::nvvm_suld_1d_array_i8_trap: + return NVPTXISD::Suld1DArrayI8Trap; + case Intrinsic::nvvm_suld_1d_array_i16_trap: + return NVPTXISD::Suld1DArrayI16Trap; + case Intrinsic::nvvm_suld_1d_array_i32_trap: + return NVPTXISD::Suld1DArrayI32Trap; + case Intrinsic::nvvm_suld_1d_array_i64_trap: + return NVPTXISD::Suld1DArrayI64Trap; + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + return NVPTXISD::Suld1DArrayV2I8Trap; + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + return NVPTXISD::Suld1DArrayV2I16Trap; + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + return NVPTXISD::Suld1DArrayV2I32Trap; + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + return NVPTXISD::Suld1DArrayV2I64Trap; + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + return NVPTXISD::Suld1DArrayV4I8Trap; + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + return NVPTXISD::Suld1DArrayV4I16Trap; + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + return NVPTXISD::Suld1DArrayV4I32Trap; + case Intrinsic::nvvm_suld_2d_i8_trap: + return NVPTXISD::Suld2DI8Trap; + case Intrinsic::nvvm_suld_2d_i16_trap: + return NVPTXISD::Suld2DI16Trap; + case Intrinsic::nvvm_suld_2d_i32_trap: + return NVPTXISD::Suld2DI32Trap; + case Intrinsic::nvvm_suld_2d_i64_trap: + return NVPTXISD::Suld2DI64Trap; + case Intrinsic::nvvm_suld_2d_v2i8_trap: + return NVPTXISD::Suld2DV2I8Trap; + case Intrinsic::nvvm_suld_2d_v2i16_trap: + return NVPTXISD::Suld2DV2I16Trap; + case Intrinsic::nvvm_suld_2d_v2i32_trap: + return NVPTXISD::Suld2DV2I32Trap; + case Intrinsic::nvvm_suld_2d_v2i64_trap: + return NVPTXISD::Suld2DV2I64Trap; + case Intrinsic::nvvm_suld_2d_v4i8_trap: + return NVPTXISD::Suld2DV4I8Trap; + case Intrinsic::nvvm_suld_2d_v4i16_trap: + return NVPTXISD::Suld2DV4I16Trap; + case Intrinsic::nvvm_suld_2d_v4i32_trap: + return NVPTXISD::Suld2DV4I32Trap; + case Intrinsic::nvvm_suld_2d_array_i8_trap: + return NVPTXISD::Suld2DArrayI8Trap; + case Intrinsic::nvvm_suld_2d_array_i16_trap: + return NVPTXISD::Suld2DArrayI16Trap; + case Intrinsic::nvvm_suld_2d_array_i32_trap: + return NVPTXISD::Suld2DArrayI32Trap; + case Intrinsic::nvvm_suld_2d_array_i64_trap: + return NVPTXISD::Suld2DArrayI64Trap; + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + return NVPTXISD::Suld2DArrayV2I8Trap; + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + return NVPTXISD::Suld2DArrayV2I16Trap; + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + 
return NVPTXISD::Suld2DArrayV2I32Trap; + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + return NVPTXISD::Suld2DArrayV2I64Trap; + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + return NVPTXISD::Suld2DArrayV4I8Trap; + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + return NVPTXISD::Suld2DArrayV4I16Trap; + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + return NVPTXISD::Suld2DArrayV4I32Trap; + case Intrinsic::nvvm_suld_3d_i8_trap: + return NVPTXISD::Suld3DI8Trap; + case Intrinsic::nvvm_suld_3d_i16_trap: + return NVPTXISD::Suld3DI16Trap; + case Intrinsic::nvvm_suld_3d_i32_trap: + return NVPTXISD::Suld3DI32Trap; + case Intrinsic::nvvm_suld_3d_i64_trap: + return NVPTXISD::Suld3DI64Trap; + case Intrinsic::nvvm_suld_3d_v2i8_trap: + return NVPTXISD::Suld3DV2I8Trap; + case Intrinsic::nvvm_suld_3d_v2i16_trap: + return NVPTXISD::Suld3DV2I16Trap; + case Intrinsic::nvvm_suld_3d_v2i32_trap: + return NVPTXISD::Suld3DV2I32Trap; + case Intrinsic::nvvm_suld_3d_v2i64_trap: + return NVPTXISD::Suld3DV2I64Trap; + case Intrinsic::nvvm_suld_3d_v4i8_trap: + return NVPTXISD::Suld3DV4I8Trap; + case Intrinsic::nvvm_suld_3d_v4i16_trap: + return NVPTXISD::Suld3DV4I16Trap; + case Intrinsic::nvvm_suld_3d_v4i32_trap: + return NVPTXISD::Suld3DV4I32Trap; + case Intrinsic::nvvm_suld_1d_i8_zero: + return NVPTXISD::Suld1DI8Zero; + case Intrinsic::nvvm_suld_1d_i16_zero: + return NVPTXISD::Suld1DI16Zero; + case Intrinsic::nvvm_suld_1d_i32_zero: + return NVPTXISD::Suld1DI32Zero; + case Intrinsic::nvvm_suld_1d_i64_zero: + return NVPTXISD::Suld1DI64Zero; + case Intrinsic::nvvm_suld_1d_v2i8_zero: + return NVPTXISD::Suld1DV2I8Zero; + case Intrinsic::nvvm_suld_1d_v2i16_zero: + return NVPTXISD::Suld1DV2I16Zero; + case Intrinsic::nvvm_suld_1d_v2i32_zero: + return NVPTXISD::Suld1DV2I32Zero; + case Intrinsic::nvvm_suld_1d_v2i64_zero: + return NVPTXISD::Suld1DV2I64Zero; + case Intrinsic::nvvm_suld_1d_v4i8_zero: + return NVPTXISD::Suld1DV4I8Zero; + case Intrinsic::nvvm_suld_1d_v4i16_zero: + return NVPTXISD::Suld1DV4I16Zero; + case Intrinsic::nvvm_suld_1d_v4i32_zero: + return NVPTXISD::Suld1DV4I32Zero; + case Intrinsic::nvvm_suld_1d_array_i8_zero: + return NVPTXISD::Suld1DArrayI8Zero; + case Intrinsic::nvvm_suld_1d_array_i16_zero: + return NVPTXISD::Suld1DArrayI16Zero; + case Intrinsic::nvvm_suld_1d_array_i32_zero: + return NVPTXISD::Suld1DArrayI32Zero; + case Intrinsic::nvvm_suld_1d_array_i64_zero: + return NVPTXISD::Suld1DArrayI64Zero; + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + return NVPTXISD::Suld1DArrayV2I8Zero; + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + return NVPTXISD::Suld1DArrayV2I16Zero; + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + return NVPTXISD::Suld1DArrayV2I32Zero; + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + return NVPTXISD::Suld1DArrayV2I64Zero; + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + return NVPTXISD::Suld1DArrayV4I8Zero; + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + return NVPTXISD::Suld1DArrayV4I16Zero; + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + return NVPTXISD::Suld1DArrayV4I32Zero; + case Intrinsic::nvvm_suld_2d_i8_zero: + return NVPTXISD::Suld2DI8Zero; + case Intrinsic::nvvm_suld_2d_i16_zero: + return NVPTXISD::Suld2DI16Zero; + case Intrinsic::nvvm_suld_2d_i32_zero: + return NVPTXISD::Suld2DI32Zero; + case Intrinsic::nvvm_suld_2d_i64_zero: + return NVPTXISD::Suld2DI64Zero; + case Intrinsic::nvvm_suld_2d_v2i8_zero: + return NVPTXISD::Suld2DV2I8Zero; + case Intrinsic::nvvm_suld_2d_v2i16_zero: + return NVPTXISD::Suld2DV2I16Zero; + case 
Intrinsic::nvvm_suld_2d_v2i32_zero: + return NVPTXISD::Suld2DV2I32Zero; + case Intrinsic::nvvm_suld_2d_v2i64_zero: + return NVPTXISD::Suld2DV2I64Zero; + case Intrinsic::nvvm_suld_2d_v4i8_zero: + return NVPTXISD::Suld2DV4I8Zero; + case Intrinsic::nvvm_suld_2d_v4i16_zero: + return NVPTXISD::Suld2DV4I16Zero; + case Intrinsic::nvvm_suld_2d_v4i32_zero: + return NVPTXISD::Suld2DV4I32Zero; + case Intrinsic::nvvm_suld_2d_array_i8_zero: + return NVPTXISD::Suld2DArrayI8Zero; + case Intrinsic::nvvm_suld_2d_array_i16_zero: + return NVPTXISD::Suld2DArrayI16Zero; + case Intrinsic::nvvm_suld_2d_array_i32_zero: + return NVPTXISD::Suld2DArrayI32Zero; + case Intrinsic::nvvm_suld_2d_array_i64_zero: + return NVPTXISD::Suld2DArrayI64Zero; + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + return NVPTXISD::Suld2DArrayV2I8Zero; + case Intrinsic::nvvm_suld_2d_array_v2i16_zero: + return NVPTXISD::Suld2DArrayV2I16Zero; + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + return NVPTXISD::Suld2DArrayV2I32Zero; + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + return NVPTXISD::Suld2DArrayV2I64Zero; + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + return NVPTXISD::Suld2DArrayV4I8Zero; + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + return NVPTXISD::Suld2DArrayV4I16Zero; + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + return NVPTXISD::Suld2DArrayV4I32Zero; + case Intrinsic::nvvm_suld_3d_i8_zero: + return NVPTXISD::Suld3DI8Zero; + case Intrinsic::nvvm_suld_3d_i16_zero: + return NVPTXISD::Suld3DI16Zero; + case Intrinsic::nvvm_suld_3d_i32_zero: + return NVPTXISD::Suld3DI32Zero; + case Intrinsic::nvvm_suld_3d_i64_zero: + return NVPTXISD::Suld3DI64Zero; + case Intrinsic::nvvm_suld_3d_v2i8_zero: + return NVPTXISD::Suld3DV2I8Zero; + case Intrinsic::nvvm_suld_3d_v2i16_zero: + return NVPTXISD::Suld3DV2I16Zero; + case Intrinsic::nvvm_suld_3d_v2i32_zero: + return NVPTXISD::Suld3DV2I32Zero; + case Intrinsic::nvvm_suld_3d_v2i64_zero: + return NVPTXISD::Suld3DV2I64Zero; + case Intrinsic::nvvm_suld_3d_v4i8_zero: + return NVPTXISD::Suld3DV4I8Zero; + case Intrinsic::nvvm_suld_3d_v4i16_zero: + return NVPTXISD::Suld3DV4I16Zero; + case Intrinsic::nvvm_suld_3d_v4i32_zero: + return NVPTXISD::Suld3DV4I32Zero; + } +} + // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as // TgtMemIntrinsic // because we need the information that is only available in the "Value" type @@ -1928,23 +3255,456 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: + case Intrinsic::nvvm_ldu_global_p: { Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) Info.memVT = getValueType(I.getType()); - else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) + else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(); + else Info.memVT = getValueType(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + + // alignment is available as metadata. + // Grab it and set the alignment. 
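+ // For example, a call to one of the llvm.nvvm.ldu.global.* intrinsics is expected to carry "align" metadata whose single operand is an i32 constant (e.g. !{i32 4}); that constant is what ends up in Info.align below. (Illustrative note; the exact IR spelling is the front-end's responsibility.)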
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + + return true; + } + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: { + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldg_global_i) + Info.memVT = getValueType(I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) + Info.memVT = getPointerTy(); else - Info.memVT = MVT::f32; + Info.memVT = getValueType(I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; Info.readMem = true; Info.writeMem = false; - Info.align = 0; + + // alignment is available as metadata. + // Grab it and set the alignment. + assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + return true; + } + case Intrinsic::nvvm_tex_1d_v4f32_s32: + case Intrinsic::nvvm_tex_1d_v4f32_f32: + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_v4f32_s32: + case Intrinsic::nvvm_tex_2d_v4f32_f32: + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_3d_v4f32_s32: + case Intrinsic::nvvm_tex_3d_v4f32_f32: + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_cube_v4f32_f32: + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + case 
Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: { + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::v4f32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_tex_1d_v4s32_s32: + case Intrinsic::nvvm_tex_1d_v4s32_f32: + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_v4s32_s32: + case Intrinsic::nvvm_tex_2d_v4s32_f32: + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_3d_v4s32_s32: + case Intrinsic::nvvm_tex_3d_v4s32_f32: + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4s32_f32: + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4u32_f32: + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_v4u32_s32: + case Intrinsic::nvvm_tex_1d_v4u32_f32: + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_v4u32_s32: + case Intrinsic::nvvm_tex_2d_v4u32_f32: + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_3d_v4u32_s32: + case Intrinsic::nvvm_tex_3d_v4u32_f32: + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + case 
Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: { + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::v4i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i8_clamp: + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + case 
Intrinsic::nvvm_suld_2d_i8_clamp: + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_3d_i8_clamp: + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + case Intrinsic::nvvm_suld_1d_i8_trap: + case Intrinsic::nvvm_suld_1d_v2i8_trap: + case Intrinsic::nvvm_suld_1d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_array_i8_trap: + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + case Intrinsic::nvvm_suld_2d_i8_trap: + case Intrinsic::nvvm_suld_2d_v2i8_trap: + case Intrinsic::nvvm_suld_2d_v4i8_trap: + case Intrinsic::nvvm_suld_2d_array_i8_trap: + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + case Intrinsic::nvvm_suld_3d_i8_trap: + case Intrinsic::nvvm_suld_3d_v2i8_trap: + case Intrinsic::nvvm_suld_3d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_i8_zero: + case Intrinsic::nvvm_suld_1d_v2i8_zero: + case Intrinsic::nvvm_suld_1d_v4i8_zero: + case Intrinsic::nvvm_suld_1d_array_i8_zero: + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + case Intrinsic::nvvm_suld_2d_i8_zero: + case Intrinsic::nvvm_suld_2d_v2i8_zero: + case Intrinsic::nvvm_suld_2d_v4i8_zero: + case Intrinsic::nvvm_suld_2d_array_i8_zero: + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + case Intrinsic::nvvm_suld_3d_i8_zero: + case Intrinsic::nvvm_suld_3d_v2i8_zero: + case Intrinsic::nvvm_suld_3d_v4i8_zero: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i8; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i16_clamp: + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_i16_clamp: + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_3d_i16_clamp: + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_i16_trap: + case Intrinsic::nvvm_suld_1d_v2i16_trap: + case Intrinsic::nvvm_suld_1d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_array_i16_trap: + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + case Intrinsic::nvvm_suld_2d_i16_trap: + case Intrinsic::nvvm_suld_2d_v2i16_trap: + case Intrinsic::nvvm_suld_2d_v4i16_trap: + case Intrinsic::nvvm_suld_2d_array_i16_trap: + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + case Intrinsic::nvvm_suld_3d_i16_trap: + case Intrinsic::nvvm_suld_3d_v2i16_trap: + case Intrinsic::nvvm_suld_3d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_i16_zero: + case Intrinsic::nvvm_suld_1d_v2i16_zero: + case Intrinsic::nvvm_suld_1d_v4i16_zero: + case Intrinsic::nvvm_suld_1d_array_i16_zero: + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + 
case Intrinsic::nvvm_suld_2d_i16_zero: + case Intrinsic::nvvm_suld_2d_v2i16_zero: + case Intrinsic::nvvm_suld_2d_v4i16_zero: + case Intrinsic::nvvm_suld_2d_array_i16_zero: + case Intrinsic::nvvm_suld_2d_array_v2i16_zero: + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + case Intrinsic::nvvm_suld_3d_i16_zero: + case Intrinsic::nvvm_suld_3d_v2i16_zero: + case Intrinsic::nvvm_suld_3d_v4i16_zero: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i16; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i32_clamp: + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_i32_clamp: + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_3d_i32_clamp: + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_i32_trap: + case Intrinsic::nvvm_suld_1d_v2i32_trap: + case Intrinsic::nvvm_suld_1d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_array_i32_trap: + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + case Intrinsic::nvvm_suld_2d_i32_trap: + case Intrinsic::nvvm_suld_2d_v2i32_trap: + case Intrinsic::nvvm_suld_2d_v4i32_trap: + case Intrinsic::nvvm_suld_2d_array_i32_trap: + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + case Intrinsic::nvvm_suld_3d_i32_trap: + case Intrinsic::nvvm_suld_3d_v2i32_trap: + case Intrinsic::nvvm_suld_3d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_i32_zero: + case Intrinsic::nvvm_suld_1d_v2i32_zero: + case Intrinsic::nvvm_suld_1d_v4i32_zero: + case Intrinsic::nvvm_suld_1d_array_i32_zero: + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + case Intrinsic::nvvm_suld_2d_i32_zero: + case Intrinsic::nvvm_suld_2d_v2i32_zero: + case Intrinsic::nvvm_suld_2d_v4i32_zero: + case Intrinsic::nvvm_suld_2d_array_i32_zero: + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + case Intrinsic::nvvm_suld_3d_i32_zero: + case Intrinsic::nvvm_suld_3d_v2i32_zero: + case Intrinsic::nvvm_suld_3d_v4i32_zero: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i64_clamp: + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_i64_clamp: + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_3d_i64_clamp: + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_i64_trap: + case Intrinsic::nvvm_suld_1d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_array_i64_trap: + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + case Intrinsic::nvvm_suld_2d_i64_trap: + case 
Intrinsic::nvvm_suld_2d_v2i64_trap: + case Intrinsic::nvvm_suld_2d_array_i64_trap: + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + case Intrinsic::nvvm_suld_3d_i64_trap: + case Intrinsic::nvvm_suld_3d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_i64_zero: + case Intrinsic::nvvm_suld_1d_v2i64_zero: + case Intrinsic::nvvm_suld_1d_array_i64_zero: + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + case Intrinsic::nvvm_suld_2d_i64_zero: + case Intrinsic::nvvm_suld_2d_v2i64_zero: + case Intrinsic::nvvm_suld_2d_array_i64_zero: + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + case Intrinsic::nvvm_suld_3d_i64_zero: + case Intrinsic::nvvm_suld_3d_v2i64_zero: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i64; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } } return false; } @@ -1999,6 +3759,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { switch (Constraint[0]) { default: break; + case 'b': case 'r': case 'h': case 'c': @@ -2018,6 +3779,8 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { + case 'b': + return std::make_pair(0U, &NVPTX::Int1RegsRegClass); case 'c': return std::make_pair(0U, &NVPTX::Int16RegsRegClass); case 'h': @@ -2041,8 +3804,434 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { return 4; } +//===----------------------------------------------------------------------===// +// NVPTX DAG Combining +//===----------------------------------------------------------------------===// + +bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, + CodeGenOpt::Level OptLevel) const { + const Function *F = MF.getFunction(); + const TargetOptions &TO = MF.getTarget().Options; + + // Always honor command-line argument + if (FMAContractLevelOpt.getNumOccurrences() > 0) { + return FMAContractLevelOpt > 0; + } else if (OptLevel == 0) { + // Do not contract if we're not optimizing the code + return false; + } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) { + // Honor TargetOptions flags that explicitly say fusion is okay + return true; + } else if (F->hasFnAttribute("unsafe-fp-math")) { + // Check for unsafe-fp-math=true coming from Clang + Attribute Attr = F->getFnAttribute("unsafe-fp-math"); + StringRef Val = Attr.getValueAsString(); + if (Val == "true") + return true; + } + + // We did not have a clear indication that fusion is allowed, so assume not + return false; +} + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SelectionDAG &DAG = DCI.DAG; + // Skip non-integer, non-scalar case + EVT VT=N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + // fold (add (mul a, b), c) -> (mad a, b, c) + // + if (N0.getOpcode() == ISD::MUL) { + assert (VT.isInteger()); + // For integer: + // Since integer multiply-add costs the same as integer multiply + // but is more costly than integer add, do the fusion only when + // the mul is only used in the add. 
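+ // Illustration: (add (mul i32 %a, %b), %c), where the mul has no other use, is rewritten to IMAD(%a, %b, %c) below, which should select to a PTX mad.lo instruction.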
+ if (OptLevel==CodeGenOpt::None || VT != MVT::i32 || + !N0.getNode()->hasOneUse()) + return SDValue(); + + // Do the folding + return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + else if (N0.getOpcode() == ISD::FMUL) { + if (VT == MVT::f32 || VT == MVT::f64) { + NVPTXTargetLowering *TLI = + (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo(); + if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) + return SDValue(); + + // For floating point: + // Do the fusion only when the mul has less than 5 uses and all + // are add. + // The heuristic is that if a use is not an add, then that use + // cannot be fused into fma, therefore mul is still needed anyway. + // If there are more than 4 uses, even if they are all add, fusing + // them will increase register pressure. + // + int numUses = 0; + int nonAddCount = 0; + for (SDNode::use_iterator UI = N0.getNode()->use_begin(), + UE = N0.getNode()->use_end(); + UI != UE; ++UI) { + numUses++; + SDNode *User = *UI; + if (User->getOpcode() != ISD::FADD) + ++nonAddCount; + } + if (numUses >= 5) + return SDValue(); + if (nonAddCount) { + int orderNo = N->getIROrder(); + int orderNo2 = N0.getNode()->getIROrder(); + // A simple heuristic here for estimating potential register + // pressure: the difference is used to measure the distance + // between def and use; the longer the distance, the more likely + // it is to cause register pressure. + if (orderNo - orderNo2 < 500) + return SDValue(); + + // Now, check if at least one of the FMUL's operands is live beyond the node N, + // which guarantees that the FMA will not increase register pressure at node N. + bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right)) + opIsLive = true; + + if (!opIsLive) + for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + return SDValue(); + } + + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + } + + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, + OptLevel); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // The type legalizer turns a vector load of i8 values into a zextload to i16 + // registers, optionally ANY_EXTENDs it (if target type is integer), + // and ANDs off the high 8 bits. Since we turn this load into a + // target-specific DAG node, the DAG combiner fails to eliminate these AND + nodes. 
Do that here. + SDValue Val = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (isa<ConstantSDNode>(Val)) { + std::swap(Val, Mask); + } + + SDValue AExt; + // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and + if (Val.getOpcode() == ISD::ANY_EXTEND) { + AExt = Val; + Val = Val->getOperand(0); + } + + if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { + Val = Val->getOperand(0); + } + + if (Val->getOpcode() == NVPTXISD::LoadV2 || + Val->getOpcode() == NVPTXISD::LoadV4) { + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + if (MaskVal != 0xff) { + // Not an AND that chops off top 8 bits + return SDValue(); + } + + MemSDNode *Mem = dyn_cast<MemSDNode>(Val); + if (!Mem) { + // Not a MemSDNode?!? + return SDValue(); + } + + EVT MemVT = Mem->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { + // We only handle the i8 case + return SDValue(); + } + + unsigned ExtType = + cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> + getZExtValue(); + if (ExtType == ISD::SEXTLOAD) { + // If for some reason the load is a sextload, the and is needed to zero + // out the high 8 bits + return SDValue(); + } + + bool AddTo = false; + if (AExt.getNode() != 0) { + // Re-insert the ext as a zext. + Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), + AExt.getValueType(), Val); + AddTo = true; + } + + // If we get here, the AND is unnecessary. Just replace it with the load + DCI.CombineTo(N, Val, AddTo); + } + + return SDValue(); +} + +enum OperandSignedness { + Signed = 0, + Unsigned, + Unknown +}; + +/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand +/// that can be demoted to \p OptSize bits without loss of information. The +/// signedness of the operand, if determinable, is placed in \p S. +static bool IsMulWideOperandDemotable(SDValue Op, + unsigned OptSize, + OperandSignedness &S) { + S = Unknown; + + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Signed; + return true; + } + } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Unsigned; + return true; + } + } + + return false; +} + +/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can +/// be demoted to \p OptSize bits without loss of information. If the operands +/// contain a constant, it should appear as the RHS operand. The signedness of +/// the operands is placed in \p IsSigned. 
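+/// For example, with \p OptSize == 16, (sext i16 %a to i32) is demotable and signed, (zext i16 %b to i32) is demotable and unsigned, and a constant RHS qualifies when it fits in 16 signed or unsigned bits, respectively.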
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, + unsigned OptSize, + bool &IsSigned) { + + OperandSignedness LHSSign; + + // The LHS operand must be a demotable op + if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) + return false; + + // We should have been able to determine the signedness from the LHS + if (LHSSign == Unknown) + return false; + + IsSigned = (LHSSign == Signed); + + // The RHS can be a demotable op or a constant + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { + APInt Val = CI->getAPIntValue(); + if (LHSSign == Unsigned) { + if (Val.isIntN(OptSize)) { + return true; + } + return false; + } else { + if (Val.isSignedIntN(OptSize)) { + return true; + } + return false; + } + } else { + OperandSignedness RHSSign; + if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) + return false; + + if (LHSSign != RHSSign) + return false; + + return true; + } +} + +/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply +/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform +/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift +/// amount. +static SDValue TryMULWIDECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT MulType = N->getValueType(0); + if (MulType != MVT::i32 && MulType != MVT::i64) { + return SDValue(); + } + + unsigned OptSize = MulType.getSizeInBits() >> 1; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Canonicalize the multiply so the constant (if any) is on the right + if (N->getOpcode() == ISD::MUL) { + if (isa<ConstantSDNode>(LHS)) { + std::swap(LHS, RHS); + } + } + + // If we have a SHL, determine the actual multiply amount + if (N->getOpcode() == ISD::SHL) { + ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); + if (!ShlRHS) { + return SDValue(); + } + + APInt ShiftAmt = ShlRHS->getAPIntValue(); + unsigned BitWidth = MulType.getSizeInBits(); + if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { + APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, MulType); + } else { + return SDValue(); + } + } + + bool Signed; + // Verify that our operands are demotable + if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { + return SDValue(); + } + + EVT DemotedVT; + if (MulType == MVT::i32) { + DemotedVT = MVT::i16; + } else { + DemotedVT = MVT::i32; + } + + // Truncate the operands to the correct size. Note that these are just for + // type consistency and will (likely) be eliminated in later phases. + SDValue TruncLHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS); + SDValue TruncRHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS); + + unsigned Opc; + if (Signed) { + Opc = NVPTXISD::MUL_WIDE_SIGNED; + } else { + Opc = NVPTXISD::MUL_WIDE_UNSIGNED; + } + + return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS); +} + +/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 
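+/// A left shift by a constant is treated as a multiply by a power of two; for example, (shl (sext i16 %x to i32), 12) can become a mul.wide.s16 by the constant 4096.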
+static SDValue PerformSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); + switch (N->getOpcode()) { + default: break; + case ISD::ADD: + case ISD::FADD: + return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::AND: + return PerformANDCombine(N, DCI); + } + return SDValue(); +} + /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, + const DataLayout *TD, SmallVectorImpl<SDValue> &Results) { EVT ResVT = N->getValueType(0); SDLoc DL(N); @@ -2070,12 +4259,26 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, break; } + LoadSDNode *LD = cast<LoadSDNode>(N); + + unsigned Align = LD->getAlignment(); + unsigned PrefAlign = + TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); + if (Align < PrefAlign) { + // This load is not sufficiently aligned, so bail out and let this vector + // load be scalarized. Note that we may still be able to emit smaller + // vector loads. For example, if we are loading a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. + return; + } + EVT EltVT = ResVT.getVectorElementType(); unsigned NumElts = ResVT.getVectorNumElements(); // Since LoadV2 is a target node, we cannot rely on DAG type legalization. // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // loaded type to i16 and propogate the "real" type as the memory type. + // loaded type to i16 and propagate the "real" type as the memory type. 
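+ // NeedTrunc records that the widened i16 results must be truncated back to the original narrow element type before the scalars are recombined into a vector.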
bool NeedTrunc = false; if (EltVT.getSizeInBits() < 16) { EltVT = MVT::i16; @@ -2095,7 +4298,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, case 4: { Opcode = NVPTXISD::LoadV4; EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; - LdResVTs = DAG.getVTList(ListVTs, 5); + LdResVTs = DAG.getVTList(ListVTs); break; } } @@ -2106,14 +4309,12 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) OtherOps.push_back(N->getOperand(i)); - LoadSDNode *LD = cast<LoadSDNode>(N); - // The select routine does not have access to the LoadSDNode instance, so // pass along the extension information OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType())); - SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0], - OtherOps.size(), LD->getMemoryVT(), + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, + LD->getMemoryVT(), LD->getMemOperand()); SmallVector<SDValue, 4> ScalarRes; @@ -2127,8 +4328,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SDValue LoadChain = NewLD.getValue(NumElts); - SDValue BuildVec = - DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); Results.push_back(BuildVec); Results.push_back(LoadChain); @@ -2162,7 +4362,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, // Since LDU/LDG are target nodes, we cannot rely on DAG type // legalization. // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // loaded type to i16 and propogate the "real" type as the memory type. + // loaded type to i16 and propagate the "real" type as the memory type. bool NeedTrunc = false; if (EltVT.getSizeInBits() < 16) { EltVT = MVT::i16; @@ -2208,7 +4408,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, break; } EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; - LdResVTs = DAG.getVTList(ListVTs, 5); + LdResVTs = DAG.getVTList(ListVTs); break; } } @@ -2225,9 +4425,9 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); - SDValue NewLD = DAG.getMemIntrinsicNode( - Opcode, DL, LdResVTs, &OtherOps[0], OtherOps.size(), - MemSD->getMemoryVT(), MemSD->getMemOperand()); + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, + MemSD->getMemoryVT(), + MemSD->getMemOperand()); SmallVector<SDValue, 4> ScalarRes; @@ -2242,7 +4442,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SDValue LoadChain = NewLD.getValue(NumElts); SDValue BuildVec = - DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); Results.push_back(BuildVec); Results.push_back(LoadChain); @@ -2264,8 +4464,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, // We make sure the memory type is i8, which will be used during isel // to select the proper instruction. 
SDValue NewLD = - DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, &Ops[0], - Ops.size(), MVT::i8, MemSD->getMemOperand()); + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, + MVT::i8, MemSD->getMemOperand()); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, NewLD.getValue(0))); @@ -2281,7 +4481,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( default: report_fatal_error("Unhandled custom legalization"); case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results); + ReplaceLoadVector(N, DAG, getDataLayout(), Results); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 66e708fcea07..bef6ed9faad6 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -16,7 +16,6 @@ #define NVPTXISELLOWERING_H #include "NVPTX.h" -#include "NVPTXSubtarget.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" @@ -50,6 +49,11 @@ enum NodeType { CallSeqBegin, CallSeqEnd, CallPrototype, + FUN_SHFL_CLAMP, + FUN_SHFR_CLAMP, + MUL_WIDE_SIGNED, + MUL_WIDE_UNSIGNED, + IMAD, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -70,78 +74,440 @@ enum NodeType { StoreParamU32, // to zext and store a <32bit value, not used currently StoreRetval, StoreRetvalV2, - StoreRetvalV4 + StoreRetvalV4, + + // Texture intrinsics + Tex1DFloatS32, + Tex1DFloatFloat, + Tex1DFloatFloatLevel, + Tex1DFloatFloatGrad, + Tex1DS32S32, + Tex1DS32Float, + Tex1DS32FloatLevel, + Tex1DS32FloatGrad, + Tex1DU32S32, + Tex1DU32Float, + Tex1DU32FloatLevel, + Tex1DU32FloatGrad, + Tex1DArrayFloatS32, + Tex1DArrayFloatFloat, + Tex1DArrayFloatFloatLevel, + Tex1DArrayFloatFloatGrad, + Tex1DArrayS32S32, + Tex1DArrayS32Float, + Tex1DArrayS32FloatLevel, + Tex1DArrayS32FloatGrad, + Tex1DArrayU32S32, + Tex1DArrayU32Float, + Tex1DArrayU32FloatLevel, + Tex1DArrayU32FloatGrad, + Tex2DFloatS32, + Tex2DFloatFloat, + Tex2DFloatFloatLevel, + Tex2DFloatFloatGrad, + Tex2DS32S32, + Tex2DS32Float, + Tex2DS32FloatLevel, + Tex2DS32FloatGrad, + Tex2DU32S32, + Tex2DU32Float, + Tex2DU32FloatLevel, + Tex2DU32FloatGrad, + Tex2DArrayFloatS32, + Tex2DArrayFloatFloat, + Tex2DArrayFloatFloatLevel, + Tex2DArrayFloatFloatGrad, + Tex2DArrayS32S32, + Tex2DArrayS32Float, + Tex2DArrayS32FloatLevel, + Tex2DArrayS32FloatGrad, + Tex2DArrayU32S32, + Tex2DArrayU32Float, + Tex2DArrayU32FloatLevel, + Tex2DArrayU32FloatGrad, + Tex3DFloatS32, + Tex3DFloatFloat, + Tex3DFloatFloatLevel, + Tex3DFloatFloatGrad, + Tex3DS32S32, + Tex3DS32Float, + Tex3DS32FloatLevel, + Tex3DS32FloatGrad, + Tex3DU32S32, + Tex3DU32Float, + Tex3DU32FloatLevel, + Tex3DU32FloatGrad, + TexCubeFloatFloat, + TexCubeFloatFloatLevel, + TexCubeS32Float, + TexCubeS32FloatLevel, + TexCubeU32Float, + TexCubeU32FloatLevel, + TexCubeArrayFloatFloat, + TexCubeArrayFloatFloatLevel, + TexCubeArrayS32Float, + TexCubeArrayS32FloatLevel, + TexCubeArrayU32Float, + TexCubeArrayU32FloatLevel, + Tld4R2DFloatFloat, + Tld4G2DFloatFloat, + Tld4B2DFloatFloat, + Tld4A2DFloatFloat, + Tld4R2DS64Float, + Tld4G2DS64Float, + Tld4B2DS64Float, + Tld4A2DS64Float, + Tld4R2DU64Float, + Tld4G2DU64Float, + Tld4B2DU64Float, + Tld4A2DU64Float, + TexUnified1DFloatS32, + TexUnified1DFloatFloat, + TexUnified1DFloatFloatLevel, + TexUnified1DFloatFloatGrad, + TexUnified1DS32S32, + TexUnified1DS32Float, + TexUnified1DS32FloatLevel, + TexUnified1DS32FloatGrad, + TexUnified1DU32S32, + 
TexUnified1DU32Float, + TexUnified1DU32FloatLevel, + TexUnified1DU32FloatGrad, + TexUnified1DArrayFloatS32, + TexUnified1DArrayFloatFloat, + TexUnified1DArrayFloatFloatLevel, + TexUnified1DArrayFloatFloatGrad, + TexUnified1DArrayS32S32, + TexUnified1DArrayS32Float, + TexUnified1DArrayS32FloatLevel, + TexUnified1DArrayS32FloatGrad, + TexUnified1DArrayU32S32, + TexUnified1DArrayU32Float, + TexUnified1DArrayU32FloatLevel, + TexUnified1DArrayU32FloatGrad, + TexUnified2DFloatS32, + TexUnified2DFloatFloat, + TexUnified2DFloatFloatLevel, + TexUnified2DFloatFloatGrad, + TexUnified2DS32S32, + TexUnified2DS32Float, + TexUnified2DS32FloatLevel, + TexUnified2DS32FloatGrad, + TexUnified2DU32S32, + TexUnified2DU32Float, + TexUnified2DU32FloatLevel, + TexUnified2DU32FloatGrad, + TexUnified2DArrayFloatS32, + TexUnified2DArrayFloatFloat, + TexUnified2DArrayFloatFloatLevel, + TexUnified2DArrayFloatFloatGrad, + TexUnified2DArrayS32S32, + TexUnified2DArrayS32Float, + TexUnified2DArrayS32FloatLevel, + TexUnified2DArrayS32FloatGrad, + TexUnified2DArrayU32S32, + TexUnified2DArrayU32Float, + TexUnified2DArrayU32FloatLevel, + TexUnified2DArrayU32FloatGrad, + TexUnified3DFloatS32, + TexUnified3DFloatFloat, + TexUnified3DFloatFloatLevel, + TexUnified3DFloatFloatGrad, + TexUnified3DS32S32, + TexUnified3DS32Float, + TexUnified3DS32FloatLevel, + TexUnified3DS32FloatGrad, + TexUnified3DU32S32, + TexUnified3DU32Float, + TexUnified3DU32FloatLevel, + TexUnified3DU32FloatGrad, + TexUnifiedCubeFloatFloat, + TexUnifiedCubeFloatFloatLevel, + TexUnifiedCubeS32Float, + TexUnifiedCubeS32FloatLevel, + TexUnifiedCubeU32Float, + TexUnifiedCubeU32FloatLevel, + TexUnifiedCubeArrayFloatFloat, + TexUnifiedCubeArrayFloatFloatLevel, + TexUnifiedCubeArrayS32Float, + TexUnifiedCubeArrayS32FloatLevel, + TexUnifiedCubeArrayU32Float, + TexUnifiedCubeArrayU32FloatLevel, + Tld4UnifiedR2DFloatFloat, + Tld4UnifiedG2DFloatFloat, + Tld4UnifiedB2DFloatFloat, + Tld4UnifiedA2DFloatFloat, + Tld4UnifiedR2DS64Float, + Tld4UnifiedG2DS64Float, + Tld4UnifiedB2DS64Float, + Tld4UnifiedA2DS64Float, + Tld4UnifiedR2DU64Float, + Tld4UnifiedG2DU64Float, + Tld4UnifiedB2DU64Float, + Tld4UnifiedA2DU64Float, + + // Surface intrinsics + Suld1DI8Clamp, + Suld1DI16Clamp, + Suld1DI32Clamp, + Suld1DI64Clamp, + Suld1DV2I8Clamp, + Suld1DV2I16Clamp, + Suld1DV2I32Clamp, + Suld1DV2I64Clamp, + Suld1DV4I8Clamp, + Suld1DV4I16Clamp, + Suld1DV4I32Clamp, + + Suld1DArrayI8Clamp, + Suld1DArrayI16Clamp, + Suld1DArrayI32Clamp, + Suld1DArrayI64Clamp, + Suld1DArrayV2I8Clamp, + Suld1DArrayV2I16Clamp, + Suld1DArrayV2I32Clamp, + Suld1DArrayV2I64Clamp, + Suld1DArrayV4I8Clamp, + Suld1DArrayV4I16Clamp, + Suld1DArrayV4I32Clamp, + + Suld2DI8Clamp, + Suld2DI16Clamp, + Suld2DI32Clamp, + Suld2DI64Clamp, + Suld2DV2I8Clamp, + Suld2DV2I16Clamp, + Suld2DV2I32Clamp, + Suld2DV2I64Clamp, + Suld2DV4I8Clamp, + Suld2DV4I16Clamp, + Suld2DV4I32Clamp, + + Suld2DArrayI8Clamp, + Suld2DArrayI16Clamp, + Suld2DArrayI32Clamp, + Suld2DArrayI64Clamp, + Suld2DArrayV2I8Clamp, + Suld2DArrayV2I16Clamp, + Suld2DArrayV2I32Clamp, + Suld2DArrayV2I64Clamp, + Suld2DArrayV4I8Clamp, + Suld2DArrayV4I16Clamp, + Suld2DArrayV4I32Clamp, + + Suld3DI8Clamp, + Suld3DI16Clamp, + Suld3DI32Clamp, + Suld3DI64Clamp, + Suld3DV2I8Clamp, + Suld3DV2I16Clamp, + Suld3DV2I32Clamp, + Suld3DV2I64Clamp, + Suld3DV4I8Clamp, + Suld3DV4I16Clamp, + Suld3DV4I32Clamp, + + Suld1DI8Trap, + Suld1DI16Trap, + Suld1DI32Trap, + Suld1DI64Trap, + Suld1DV2I8Trap, + Suld1DV2I16Trap, + Suld1DV2I32Trap, + Suld1DV2I64Trap, + Suld1DV4I8Trap, + Suld1DV4I16Trap, + 
Suld1DV4I32Trap, + + Suld1DArrayI8Trap, + Suld1DArrayI16Trap, + Suld1DArrayI32Trap, + Suld1DArrayI64Trap, + Suld1DArrayV2I8Trap, + Suld1DArrayV2I16Trap, + Suld1DArrayV2I32Trap, + Suld1DArrayV2I64Trap, + Suld1DArrayV4I8Trap, + Suld1DArrayV4I16Trap, + Suld1DArrayV4I32Trap, + + Suld2DI8Trap, + Suld2DI16Trap, + Suld2DI32Trap, + Suld2DI64Trap, + Suld2DV2I8Trap, + Suld2DV2I16Trap, + Suld2DV2I32Trap, + Suld2DV2I64Trap, + Suld2DV4I8Trap, + Suld2DV4I16Trap, + Suld2DV4I32Trap, + + Suld2DArrayI8Trap, + Suld2DArrayI16Trap, + Suld2DArrayI32Trap, + Suld2DArrayI64Trap, + Suld2DArrayV2I8Trap, + Suld2DArrayV2I16Trap, + Suld2DArrayV2I32Trap, + Suld2DArrayV2I64Trap, + Suld2DArrayV4I8Trap, + Suld2DArrayV4I16Trap, + Suld2DArrayV4I32Trap, + + Suld3DI8Trap, + Suld3DI16Trap, + Suld3DI32Trap, + Suld3DI64Trap, + Suld3DV2I8Trap, + Suld3DV2I16Trap, + Suld3DV2I32Trap, + Suld3DV2I64Trap, + Suld3DV4I8Trap, + Suld3DV4I16Trap, + Suld3DV4I32Trap, + + Suld1DI8Zero, + Suld1DI16Zero, + Suld1DI32Zero, + Suld1DI64Zero, + Suld1DV2I8Zero, + Suld1DV2I16Zero, + Suld1DV2I32Zero, + Suld1DV2I64Zero, + Suld1DV4I8Zero, + Suld1DV4I16Zero, + Suld1DV4I32Zero, + + Suld1DArrayI8Zero, + Suld1DArrayI16Zero, + Suld1DArrayI32Zero, + Suld1DArrayI64Zero, + Suld1DArrayV2I8Zero, + Suld1DArrayV2I16Zero, + Suld1DArrayV2I32Zero, + Suld1DArrayV2I64Zero, + Suld1DArrayV4I8Zero, + Suld1DArrayV4I16Zero, + Suld1DArrayV4I32Zero, + + Suld2DI8Zero, + Suld2DI16Zero, + Suld2DI32Zero, + Suld2DI64Zero, + Suld2DV2I8Zero, + Suld2DV2I16Zero, + Suld2DV2I32Zero, + Suld2DV2I64Zero, + Suld2DV4I8Zero, + Suld2DV4I16Zero, + Suld2DV4I32Zero, + + Suld2DArrayI8Zero, + Suld2DArrayI16Zero, + Suld2DArrayI32Zero, + Suld2DArrayI64Zero, + Suld2DArrayV2I8Zero, + Suld2DArrayV2I16Zero, + Suld2DArrayV2I32Zero, + Suld2DArrayV2I64Zero, + Suld2DArrayV4I8Zero, + Suld2DArrayV4I16Zero, + Suld2DArrayV4I32Zero, + + Suld3DI8Zero, + Suld3DI16Zero, + Suld3DI32Zero, + Suld3DI64Zero, + Suld3DV2I8Zero, + Suld3DV2I16Zero, + Suld3DV2I32Zero, + Suld3DV2I64Zero, + Suld3DV4I8Zero, + Suld3DV4I16Zero, + Suld3DV4I32Zero }; } +class NVPTXSubtarget; + //===--------------------------------------------------------------------===// // TargetLowering Implementation //===--------------------------------------------------------------------===// class NVPTXTargetLowering : public TargetLowering { public: explicit NVPTXTargetLowering(NVPTXTargetMachine &TM); - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, SelectionDAG &DAG) const; - virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; bool isTypeSupportedInIntrinsic(MVT VT) const; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const; + unsigned Intrinsic) const override; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type /// Used to guide target specific optimizations, like loop strength /// reduction (LoopStrengthReduce.cpp) and memory optimization for /// address mode (CodeGenPrepare.cpp) - virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; /// getFunctionAlignment - Return the Log2 alignment of this function. 
- virtual unsigned getFunctionAlignment(const Function *F) const; + unsigned getFunctionAlignment(const Function *F) const; - virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const { + EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override { if (VT.isVector()) - return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); return MVT::i1; } - ConstraintType getConstraintType(const std::string &Constraint) const; + ConstraintType + getConstraintType(const std::string &Constraint) const override; std::pair<unsigned, const TargetRegisterClass *> - getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const override; - virtual SDValue LowerFormalArguments( + SDValue LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; + SmallVectorImpl<SDValue> &InVals) const override; - virtual SDValue - LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; std::string getPrototype(Type *, const ArgListTy &, const SmallVectorImpl<ISD::OutputArg> &, unsigned retAlignment, const ImmutableCallSite *CS) const; - virtual SDValue + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, - SelectionDAG &DAG) const; + SelectionDAG &DAG) const override; - virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, - std::vector<SDValue> &Ops, - SelectionDAG &DAG) const; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; NVPTXTargetMachine *nvTM; // PTX always uses 32-bit shift amounts - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; - virtual bool shouldSplitVectorElementType(EVT VT) const; + bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const; + + virtual bool isFMAFasterThanFMulAndFAdd(EVT) const { + return true; + } private: const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here @@ -160,8 +526,12 @@ private: SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; - virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, Type *Ty, unsigned Idx) const; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp new file mode 100644 index 000000000000..a98fb37f6e25 --- /dev/null +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -0,0 +1,178 @@ +//===-- NVPTXImageOptimizer.cpp - 
Image optimization pass -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR-level optimizations of image access code, +// including: +// +// 1. Eliminate istypep intrinsics when image access qualifier is known +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "NVPTXUtilities.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/ConstantFolding.h" + +using namespace llvm; + +namespace { +class NVPTXImageOptimizer : public FunctionPass { +private: + static char ID; + SmallVector<Instruction*, 4> InstrToDelete; + +public: + NVPTXImageOptimizer(); + + bool runOnFunction(Function &F) override; + +private: + bool replaceIsTypePSampler(Instruction &I); + bool replaceIsTypePSurface(Instruction &I); + bool replaceIsTypePTexture(Instruction &I); + Value *cleanupValue(Value *V); + void replaceWith(Instruction *From, ConstantInt *To); +}; +} + +char NVPTXImageOptimizer::ID = 0; + +NVPTXImageOptimizer::NVPTXImageOptimizer() + : FunctionPass(ID) {} + +bool NVPTXImageOptimizer::runOnFunction(Function &F) { + bool Changed = false; + InstrToDelete.clear(); + + // Look for call instructions in the function + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; + ++BI) { + for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end(); + I != E; ++I) { + Instruction &Instr = *I; + if (CallInst *CI = dyn_cast<CallInst>(I)) { + Function *CalledF = CI->getCalledFunction(); + if (CalledF && CalledF->isIntrinsic()) { + // This is an intrinsic function call, check if its an istypep + switch (CalledF->getIntrinsicID()) { + default: break; + case Intrinsic::nvvm_istypep_sampler: + Changed |= replaceIsTypePSampler(Instr); + break; + case Intrinsic::nvvm_istypep_surface: + Changed |= replaceIsTypePSurface(Instr); + break; + case Intrinsic::nvvm_istypep_texture: + Changed |= replaceIsTypePTexture(Instr); + break; + } + } + } + } + } + + // Delete any istypep instances we replaced in the IR + for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i) + InstrToDelete[i]->eraseFromParent(); + + return Changed; +} + +bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) { + Value *TexHandle = cleanupValue(I.getOperand(0)); + if (isSampler(*TexHandle)) { + // This is an OpenCL sampler, so it must be a samplerref + replaceWith(&I, ConstantInt::getTrue(I.getContext())); + return true; + } else if (isImageWriteOnly(*TexHandle) || + isImageReadWrite(*TexHandle) || + isImageReadOnly(*TexHandle)) { + // This is an OpenCL image, so it cannot be a samplerref + replaceWith(&I, ConstantInt::getFalse(I.getContext())); + return true; + } else { + // The image type is unknown, so we cannot eliminate the intrinsic + return false; + } +} + +bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) { + Value *TexHandle = cleanupValue(I.getOperand(0)); + if (isImageReadWrite(*TexHandle) || + isImageWriteOnly(*TexHandle)) { + // This is an OpenCL read-only/read-write image, so it must be a surfref + replaceWith(&I, ConstantInt::getTrue(I.getContext())); + return true; + } else if (isImageReadOnly(*TexHandle) || + isSampler(*TexHandle)) { + // This is an OpenCL read-only/ imageor sampler, so it cannot be + // a 
surfref + replaceWith(&I, ConstantInt::getFalse(I.getContext())); + return true; + } else { + // The image type is unknown, so we cannot eliminate the intrinsic + return false; + } +} + +bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) { + Value *TexHandle = cleanupValue(I.getOperand(0)); + if (isImageReadOnly(*TexHandle)) { + // This is an OpenCL read-only image, so it must be a texref + replaceWith(&I, ConstantInt::getTrue(I.getContext())); + return true; + } else if (isImageWriteOnly(*TexHandle) || + isImageReadWrite(*TexHandle) || + isSampler(*TexHandle)) { + // This is an OpenCL read-write/write-only image or a sampler, so it + // cannot be a texref + replaceWith(&I, ConstantInt::getFalse(I.getContext())); + return true; + } else { + // The image type is unknown, so we cannot eliminate the intrinsic + return false; + } +} + +void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) { + // We implement "poor man's DCE" here to make sure any code that is no longer + // live is actually unreachable and can be trivially eliminated by the + // unreachable block elimination pass. + for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ++UI) { + if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) { + if (BI->isUnconditional()) continue; + BasicBlock *Dest; + if (To->isZero()) + // Get false block + Dest = BI->getSuccessor(1); + else + // Get true block + Dest = BI->getSuccessor(0); + BranchInst::Create(Dest, BI); + InstrToDelete.push_back(BI); + } + } + From->replaceAllUsesWith(To); + InstrToDelete.push_back(From); +} + +Value *NVPTXImageOptimizer::cleanupValue(Value *V) { + if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) { + return cleanupValue(EVI->getAggregateOperand()); + } + return V; +} + +FunctionPass *llvm::createNVPTXImageOptimizerPass() { + return new NVPTXImageOptimizer(); +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td index f11f1b8f96fc..ffcb5d5273a2 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td @@ -36,8 +36,24 @@ class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern> bit IsLoad = 0; bit IsStore = 0; - let TSFlags{3-0} = VecInstType; - let TSFlags{4-4} = IsSimpleMove; - let TSFlags{5-5} = IsLoad; - let TSFlags{6-6} = IsStore; + bit IsTex = 0; + bit IsSust = 0; + bit IsSurfTexQuery = 0; + bit IsTexModeUnified = 0; + + // The following field is encoded as log2 of the vector size minus one, + // with 0 meaning the operation is not a surface instruction. For example, + // if IsSuld == 2, then the instruction is a suld instruction with vector size + // 2**(2-1) = 2. 
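Working from the IsSuld comment above (by its own worked example, the field holds log2 of the vector size plus one, with 0 reserved for non-surface instructions), a hypothetical decoder looks like this; suldVectorSize is not part of the patch:

// Recover the suld vector size from the 2-bit IsSuld field: 0 means the
// instruction is not a surface load, otherwise the size is 2^(IsSuld - 1),
// matching the 2**(2-1) = 2 example above.
#include <cassert>

static unsigned suldVectorSize(unsigned IsSuld) {
  return IsSuld == 0 ? 0 : 1u << (IsSuld - 1);
}

int main() {
  assert(suldVectorSize(0) == 0); // not a surface load
  assert(suldVectorSize(1) == 1); // scalar suld
  assert(suldVectorSize(2) == 2); // v2 suld
  assert(suldVectorSize(3) == 4); // v4 suld
  return 0;
}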
+ bits<2> IsSuld = 0; + + let TSFlags{3-0} = VecInstType; + let TSFlags{4-4} = IsSimpleMove; + let TSFlags{5-5} = IsLoad; + let TSFlags{6-6} = IsStore; + let TSFlags{7} = IsTex; + let TSFlags{9-8} = IsSuld; + let TSFlags{10} = IsSust; + let TSFlags{11} = IsSurfTexQuery; + let TSFlags{12} = IsTexModeUnified; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 86ddd3817709..b5b4fbed0799 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -14,8 +14,6 @@ #include "NVPTX.h" #include "NVPTXInstrInfo.h" #include "NVPTXTargetMachine.h" -#define GET_INSTRINFO_CTOR_DTOR -#include "NVPTXGenInstrInfo.inc" #include "llvm/IR/Function.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFunction.h" @@ -24,12 +22,15 @@ using namespace llvm; +#define GET_INSTRINFO_CTOR_DTOR +#include "NVPTXGenInstrInfo.inc" + // Pin the vtable to this file. void NVPTXInstrInfo::anchor() {} // FIXME: Add the subtarget support on this constructor. -NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) - : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {} +NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI) + : NVPTXGenInstrInfo(), RegInfo(STI) {} void NVPTXInstrInfo::copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, @@ -256,7 +257,7 @@ unsigned NVPTXInstrInfo::InsertBranch( "NVPTX branch conditions have two components!"); // One-way branch. - if (FBB == 0) { + if (!FBB) { if (Cond.empty()) // Unconditional branch BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB); else // Conditional branch diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 600fc5c60a1b..2ac29748676a 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -24,13 +24,12 @@ namespace llvm { class NVPTXInstrInfo : public NVPTXGenInstrInfo { - NVPTXTargetMachine &TM; const NVPTXRegisterInfo RegInfo; virtual void anchor(); public: - explicit NVPTXInstrInfo(NVPTXTargetMachine &TM); + explicit NVPTXInstrInfo(NVPTXSubtarget &STI); - virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } + const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } /* The following virtual functions are used in register allocation. * They are not implemented because the existing interface and the logic @@ -50,9 +49,9 @@ public: * const TargetRegisterClass *RC) const; */ - virtual void copyPhysReg( + void copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, bool KillSrc) const; + unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DestReg) const; bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; @@ -61,13 +60,13 @@ public: virtual bool CanTailMerge(const MachineInstr *MI) const; // Branch analysis. 
- virtual bool AnalyzeBranch( + bool AnalyzeBranch( MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const; - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - virtual unsigned InsertBranch( + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const; + const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override; unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const { return MI.getOperand(2).getImm(); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b23f1e4b458d..9900b8c8433f 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,17 +139,10 @@ def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; -def doFMAF32 : Predicate<"doFMAF32">; -def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">; -def doFMAF32AGG : Predicate<"doFMAF32AGG">; -def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">; -def doFMAF64 : Predicate<"doFMAF64">; -def doFMAF64AGG : Predicate<"doFMAF64AGG">; - def doMulWide : Predicate<"doMulWide">; -def allowFMA : Predicate<"allowFMA">; -def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">; +def allowFMA : Predicate<"allowFMA()">; +def noFMA : Predicate<"!allowFMA()">; def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; @@ -158,9 +151,12 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; +def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">; def true : Predicate<"1">; +def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">; + //===----------------------------------------------------------------------===// // Some Common Instruction Class Templates @@ -219,13 +215,13 @@ multiclass F3<string OpcStr, SDNode OpNode> { !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[allowFMA_ftz]>; + Requires<[allowFMA, doF32FTZ]>; def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[allowFMA_ftz]>; + Requires<[allowFMA, doF32FTZ]>; def f32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), @@ -245,34 +241,38 @@ multiclass F3_rn<string OpcStr, SDNode OpNode> { (ins Float64Regs:$a, Float64Regs:$b), !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), [(set Float64Regs:$dst, - (OpNode Float64Regs:$a, Float64Regs:$b))]>; + (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[noFMA]>; def f64ri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b), !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), [(set Float64Regs:$dst, - (OpNode Float64Regs:$a, fpimm:$b))]>; + (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), 
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[doF32FTZ]>; + Requires<[noFMA, doF32FTZ]>; def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[doF32FTZ]>; + Requires<[noFMA, doF32FTZ]>; def f32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, Float32Regs:$b))]>; + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA]>; def f32ri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, - (OpNode Float32Regs:$a, fpimm:$b))]>; + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; } multiclass F2<string OpcStr, SDNode OpNode> { @@ -461,33 +461,45 @@ def SHL2MUL16 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(temp.shl(v), MVT::i16); }]>; -def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), +def MULWIDES64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, i64imm:$b), +def MULWIDES64Imm64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), +def MULWIDEU64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, i64imm:$b), +def MULWIDEU64Imm64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), +def MULWIDES32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, i32imm:$b), +def MULWIDES32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, i32imm:$b), +def MULWIDEU32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, @@ -507,25 +519,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), 
(MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>, + (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>, + (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>, + (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>, + (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + + +def SDTMulWide + : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def mul_wide_signed + : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; +def mul_wide_unsigned + : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; + +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), + (MULWIDES32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), + (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + + +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), + (MULWIDES64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), + (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, Requires<[doMulWide]>; defm MULT : I3<"mul.lo.s", mul>; @@ -541,69 +591,75 @@ defm SREM : I3<"rem.s", srem>; defm UREM : I3<"rem.u", urem>; // The ri version will not be selected as DAGCombiner::visitUREM will lower it. 
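The NVPTXISD::IMAD node defined just below folds a multiply and an add into a single PTX mad.lo; in plain C++ the operation it models is simply the following (sketch only; imad32 is an illustrative name, and unsigned arithmetic is used so the wrap-around of the low-half product is well defined):

// d = a * b + c, keeping only the low 32 bits of the product, as mad.lo.s32 does.
#include <cassert>
#include <cstdint>

static uint32_t imad32(uint32_t a, uint32_t b, uint32_t c) {
  return a * b + c; // low 32 bits of a*b, plus c
}

int main() {
  assert(imad32(3, 4, 5) == 17);
  assert(imad32(0x10000u, 0x10000u, 7) == 7); // high half of the product is dropped
  return 0;
}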
+def SDTIMAD + : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, + SDTCisInt<2>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def imad + : SDNode<"NVPTXISD::IMAD", SDTIMAD>; + def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b), - imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, imm:$c))]>; def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, imm:$c))]>; def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), imm:$c))]>; - + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, imm:$c))]>; def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins 
Int16Regs:$src), "neg.s16 \t$dst, $src;", @@ -689,12 +745,24 @@ def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst), [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, + (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.approx.f32 \t$dst, $a, $b;", [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, Requires<[do_DIVF32_APPROX]>; +def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, + (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX]>; // // F32 Semi-accurate reciprocal // @@ -797,36 +865,26 @@ multiclass FPCONTRACT32<string OpcStr, Predicate Pred> { def rrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), - Float32Regs:$c))]>, Requires<[Pred]>; - // This is to WAR a weird bug in Tablegen that does not automatically - // generate the following permutated rule rrr2 from the above rrr. - // So we explicitly add it here. This happens to FMA32 only. - // See the comments at FMAD32 and FMA32 for more information. - def rrr2 : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd Float32Regs:$c, - (fmul Float32Regs:$a, Float32Regs:$b)))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>, Requires<[Pred]>; def rir : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } @@ -834,73 +892,32 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> { def rrr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, Float64Regs:$b), - Float64Regs:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>, + Requires<[Pred]>; def rri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a, - Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>; + [(set 
Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>, + Requires<[Pred]>; def rir : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } -// Due to a unknown reason (most likely a bug in tablegen), tablegen does not -// automatically generate the rrr2 rule from -// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32. -// If we reverse the order of the following two lines, then rrr2 rule will be -// generated for FMA32, but not for rrr. -// Therefore, we manually write the rrr2 rule in FPCONTRACT32. -defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; -defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; -defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; - -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c -// b*c-a => fmad(b, c, -a) -// - legal because b*c-a <=> b*c+(-a) -multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)), - (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>, - Requires<[Pred]>; - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)), - (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a), - (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>, - Requires<[Pred]>; -} - -defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>; -defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>; -defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>; +defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>; +defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>; +defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>; def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "sin.approx.f32 \t$dst, $src;", @@ -1071,6 +1088,43 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> { defm SRA : RSHIFT_FORMAT<"shr.s", sra>; defm SRL : RSHIFT_FORMAT<"shr.u", srl>; +// +// Rotate: use ptx shf instruction if available. 
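The identity the hardware-rotate patterns below rely on, sketched in C++ (rotl32 is an illustrative name; a 32-bit rotate-left is a funnel shift of the value with itself, which is what shf.l.wrap.b32 $dst, $src, $src, $amt computes, while the software fallback further down builds the same result from two shifts and an add):

// 32-bit rotate-left as a self-funnel shift.
#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t x, unsigned n) {
  n &= 31;                                   // shf.*.wrap also wraps the amount
  return (x << n) | (x >> ((32 - n) & 31));  // "shf.l.wrap.b32 d, x, x, n"
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
  return 0;
}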
+// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// +// Rotate: if ptx shf instruction is not available, then use shift+add +// // 32bit def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), @@ -1088,9 +1142,11 @@ def SUB_FRM_32 : SDNodeXForm<imm, [{ }]>; def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>; + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>; + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1103,7 +1159,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1116,7 +1173,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; // 64bit def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, @@ -1165,6 +1223,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, !strconcat("}}", ""))))))))), [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; +// BFE - bit-field extract + +multiclass BFE<string TyStr, RegisterClass RC> { + // BFE supports both 32-bit and 64-bit values, but the start and length + // operands are always 32-bit + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; +} + +defm BFE_S32 : BFE<"s32", Int32Regs>; 
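A rough model of the unsigned 32-bit BFE form, assuming the usual PTX bfe semantics of extracting $c bits of $a starting at bit $b with zero extension (bfe_u32 is an illustrative name; the real instruction also clamps out-of-range operands, which this sketch omits, so b + c is assumed to be at most 32). The remaining BFE_U32/BFE_S64/BFE_U64 instantiations continue right after this sketch:

// Extract c bits of a starting at bit b, zero-extended.
#include <cassert>
#include <cstdint>

static uint32_t bfe_u32(uint32_t a, uint32_t b, uint32_t c) {
  if (c == 0)
    return 0;
  uint32_t mask = (c >= 32) ? 0xffffffffu : ((1u << c) - 1u);
  return (a >> b) & mask;
}

int main() {
  assert(bfe_u32(0xabcd1234u, 8, 8) == 0x12u);  // byte 1
  assert(bfe_u32(0xabcd1234u, 28, 4) == 0xau);  // top nibble
  return 0;
}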
+defm BFE_U32 : BFE<"u32", Int32Regs>; +defm BFE_S64 : BFE<"s64", Int64Regs>; +defm BFE_U64 : BFE<"u64", Int64Regs>; //----------------------------------- // General Comparison @@ -1280,6 +1361,32 @@ def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)), (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a), (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>; +// +// Funnnel shift in clamp mode +// +// - SDNodes are created so they can be used in the DAG code, +// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +// +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP Int32Regs:$lo, + Int32Regs:$hi, Int32Regs:$amt))]>; + +def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP Int32Regs:$lo, + Int32Regs:$hi, Int32Regs:$amt))]>; + //----------------------------------- // Data Movement (Load / Store, Move) //----------------------------------- @@ -1807,7 +1914,7 @@ def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">; def StoreParamV4I32 : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2, Int32Regs:$val3, Int32Regs:$val4, i32imm:$a, i32imm:$b), - "st.param.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};", + "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};", []>; def StoreParamV4I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 14049b1cf8eb..14e51aa309ea 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -792,13 +792,18 @@ def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t", "}}")))), Float32Regs, Int16Regs, int_nvvm_h2f>; -def : Pat<(f32 (f16_to_f32 Int16Regs:$a)), +def : Pat<(f32 (f16_to_fp Int16Regs:$a)), (CVT_f32_f16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i16 (f32_to_f16 Float32Regs:$a)), +def : Pat<(i16 (fp_to_f16 Float32Regs:$a)), (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i16 (f32_to_f16 Float32Regs:$a)), +def : Pat<(i16 (fp_to_f16 Float32Regs:$a)), (CVT_f16_f32 Float32Regs:$a, CvtRN)>; +def : Pat<(f64 (f16_to_fp Int16Regs:$a)), + (CVT_f64_f16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i16 (fp_to_f16 Float64Regs:$a)), + (CVT_f16_f64 Float64Regs:$a, CvtRN)>; + // // Bitcast // @@ -1057,12 +1062,24 @@ def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_max_32 node:$a, node:$b)>; def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_64 node:$a, node:$b)>; +def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_64 node:$a, node:$b)>; +def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_64 node:$a, node:$b)>; def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; def atomic_load_umax_32_s: 
ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>; @@ -1072,6 +1089,14 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64", + ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64", + ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max", + atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", @@ -1080,6 +1105,14 @@ defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", + ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max", + atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_min @@ -1089,12 +1122,24 @@ def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, 
node:$b)>; +def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>; @@ -1104,6 +1149,14 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64", + ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64", + ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min", + atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", @@ -1112,6 +1165,14 @@ defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", + ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min", + atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_inc atom_dec @@ -1153,6 +1214,12 @@ def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>; @@ -1162,6 +1229,14 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and", atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", 
".and", + atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and", + atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and", + atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_or @@ -1171,6 +1246,12 @@ def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>; @@ -1180,6 +1261,14 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>; defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or", atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or", + atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or", + atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>; +defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or", + atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>; // atom_xor @@ -1189,6 +1278,12 @@ def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>; @@ -1198,6 +1293,14 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor", atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor", + atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor", + atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor", + atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".xor", atomic_load_xor_64_gen, i64imm, imm, 
useAtomRedG64forGen64>; // atom_cas @@ -1276,67 +1379,33 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, // Support for ldu on sm_20 or later //----------------------------------- -def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); - return M->getMemoryVT() == MVT::i8; -}]>; - // Scalar -// @TODO: Revisit this, Changed imemAny to imem -multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { +multiclass LDU_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; } -multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; -} - -defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, - ldu_i8>; -defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_p>; -defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_p>; +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, 
[$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; +defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>; +defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>; +defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1406,65 +1475,40 @@ defm INT_PTX_LDU_G_v4f32_ELE // Support for ldg on sm_35 or later //----------------------------------- -def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); - return M->getMemoryVT() == MVT::i8; -}]>; - -multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { +multiclass LDG_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; -} - -multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; } defm INT_PTX_LDG_GLOBAL_i8 - : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>; + : LDG_G<"u8 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i16 - : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u16 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_i64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u64 
\t$result, [$src];", Int64Regs>; defm INT_PTX_LDG_GLOBAL_f32 - : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f32 \t$result, [$src];", Float32Regs>; defm INT_PTX_LDG_GLOBAL_f64 - : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f64 \t$result, [$src];", Float64Regs>; defm INT_PTX_LDG_GLOBAL_p32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_p64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1666,6 +1710,9 @@ def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen (MoveParam texternalsym:$src)))), (nvvm_move_ptr32 texternalsym:$src)>; +def texsurf_handles + : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), + "mov.u64 \t$result, $src;", []>; //----------------------------------- // Compiler Error Warn @@ -1686,6 +1733,5224 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), [(int_nvvm_compiler_error Int64Regs:$a)]>; +// isspacep + +def ISSPACEP_CONST_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_CONST_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_GLOBAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>; +def ISSPACEP_GLOBAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>; +def ISSPACEP_LOCAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>; +def ISSPACEP_LOCAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>; +def ISSPACEP_SHARED_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>; +def ISSPACEP_SHARED_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>; + + +// Special register reads +def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d), + (ins SpecialRegs:$r), + "mov.b32\t$d, $r;", []>; + +def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>; +def : 
Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>; + + +// rotate builtin support + +def ROTATE_B32_HW_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTATE_B32_HW_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), + (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]> ; + +def GET_LO_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def GET_HI_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def PACK_TWO_INT32 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), + "mov.b64 \t$dst, {{$lo, $hi}};", []> ; + +def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), + (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src))> ; + +// funnel shift, requires >= sm_32 +def SHF_L_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_L_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, 
$amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +// HW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt), + (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt), + (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + +// SW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; + + +//----------------------------------- +// Texture Intrinsics +//----------------------------------- + +// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be +// also defined in NVPTXReplaceImageHandles.cpp + +// texmode_independent +let IsTex = 1, IsTexModeUnified = 0 in { +// Texture fetch instructions using handles +def TEX_1D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), + "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), + "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod), + "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], $lod;", + []>; +def TEX_1D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + 
[]>; +def TEX_1D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), + "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), + "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], $lod;", + []>; +def TEX_1D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_1D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), + "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), + "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], $lod;", + []>; +def TEX_1D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_1D_ARRAY_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], $lod;", + []>; +def TEX_1D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_1D_ARRAY_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + 
Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], $lod;", + []>; +def TEX_1D_ARRAY_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_1D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], $lod;", + []>; +def TEX_1D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_2D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], $lod;", + []>; +def TEX_2D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_2D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, 
Float32Regs:$y), + "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], $lod;", + []>; +def TEX_2D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_2D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], $lod;", + []>; +def TEX_2D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_2D_ARRAY_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_2D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_2D_ARRAY_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, 
$x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_2D_ARRAY_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_2D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_2D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_3D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_3D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, 
Float32Regs:$grady2), + "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_3D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_3D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_3D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_3D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; + +def TEX_CUBE_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_CUBE_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + 
[]>; +def TEX_CUBE_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_CUBE_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_CUBE_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_CUBE_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; + +def TEX_CUBE_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_CUBE_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;", + []>; +def TEX_CUBE_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_CUBE_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;", + []>; +def TEX_CUBE_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_CUBE_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;", + []>; + +def TLD4_R_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_G_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, 
Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_B_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_A_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_R_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_G_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_B_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_A_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_R_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_G_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_B_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TLD4_A_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +} + + +// texmode_unified +let IsTex = 1, IsTexModeUnified = 1 in { +// Texture fetch instructions using handles +def TEX_UNIFIED_1D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x), + "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x), + "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod), + "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + 
"[$t, \\{$x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x), + "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x), + "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x), + "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x), + "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];", + []>; +def TEX_UNIFIED_1D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_UNIFIED_1D_ARRAY_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins 
Int64Regs:$t, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}];", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], $lod;", + []>; +def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_UNIFIED_2D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + 
Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TEX_UNIFIED_2D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_UNIFIED_2D_ARRAY_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def 
TEX_UNIFIED_2D_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_UNIFIED_3D_F32_S32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_3D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, 
$gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_UNIFIED_3D_S32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_3D_S32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_UNIFIED_3D_U32_S32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_3D_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_3D_U32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; + +def TEX_UNIFIED_CUBE_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, 
\\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$x, $y, $z, $z\\}], $lod;", + []>; + +def TEX_UNIFIED_CUBE_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_ARRAY_S32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}], $lod;", + []>; +def TEX_UNIFIED_CUBE_ARRAY_U32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z), + "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}];", + []>; +def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int32Regs:$l, + Float32Regs:$x, Float32Regs:$y, Float32Regs:$z, + Float32Regs:$lod), + "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, \\{$l, $x, $y, $z\\}], $lod;", + []>; + +def TLD4_UNIFIED_R_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_G_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_B_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, 
$y\\}];", + []>; +def TLD4_UNIFIED_A_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1, + Float32Regs:$v2, Float32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_R_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_G_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_B_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_A_2D_S32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_R_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_G_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_B_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +def TLD4_UNIFIED_A_2D_U32_F32 + : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1, + Int32Regs:$v2, Int32Regs:$v3), + (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y), + "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, " + "[$t, \\{$x, $y\\}];", + []>; +} + + + +//=== Surface load instructions +// .clamp variant +let IsSuld = 1 in { +def SULD_1D_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I64_CLAMP + : 
NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 2 in { +def SULD_1D_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_CLAMP + : 
NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_V2I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I64_CLAMP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 3 in { +def SULD_1D_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", 
+ []>; + +def SULD_1D_ARRAY_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + + +def SULD_3D_V4I8_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I16_CLAMP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I32_CLAMP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +} + + +// .trap variant +let IsSuld = 1 in { +def SULD_1D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, 
Int32Regs:$x), + "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I64_TRAP + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 2 in { +def SULD_1D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def 
SULD_1D_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I64_TRAP + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 3 in { +def SULD_1D_V4I8_TRAP + : NVPTXInst<(outs 
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + + +def SULD_3D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +// .zero variant +let 
IsSuld = 1 in { +def SULD_1D_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_ZERO + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_ZERO + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I64_ZERO + : NVPTXInst<(outs Int64Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 2 
in { +def SULD_1D_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_V2I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, 
$y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I64_ZERO + : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +let IsSuld = 3 in { +def SULD_1D_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + + +def SULD_3D_V4I8_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def 
SULD_3D_V4I16_ZERO + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I32_ZERO + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +} + +//----------------------------------- +// Texture Query Intrinsics +//----------------------------------- + +let IsSurfTexQuery = 1 in { +def TXQ_CHANNEL_ORDER + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.channel_order.b32 \t$d, [$a];", + []>; +def TXQ_CHANNEL_DATA_TYPE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.channel_data_type.b32 \t$d, [$a];", + []>; +def TXQ_WIDTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.width.b32 \t$d, [$a];", + []>; +def TXQ_HEIGHT + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.height.b32 \t$d, [$a];", + []>; +def TXQ_DEPTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.depth.b32 \t$d, [$a];", + []>; +def TXQ_ARRAY_SIZE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.array_size.b32 \t$d, [$a];", + []>; +def TXQ_NUM_SAMPLES + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.num_samples.b32 \t$d, [$a];", + []>; +def TXQ_NUM_MIPMAP_LEVELS + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.num_mipmap_levels.b32 \t$d, [$a];", + []>; +} + +def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a), + (TXQ_CHANNEL_ORDER Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a), + (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_width Int64Regs:$a), + (TXQ_WIDTH Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_height Int64Regs:$a), + (TXQ_HEIGHT Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_depth Int64Regs:$a), + (TXQ_DEPTH Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_array_size Int64Regs:$a), + (TXQ_ARRAY_SIZE Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a), + (TXQ_NUM_SAMPLES Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a), + (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>; + + +//----------------------------------- +// Surface Query Intrinsics +//----------------------------------- + +let IsSurfTexQuery = 1 in { +def SUQ_CHANNEL_ORDER + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.channel_order.b32 \t$d, [$a];", + []>; +def SUQ_CHANNEL_DATA_TYPE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.channel_data_type.b32 \t$d, [$a];", + []>; +def SUQ_WIDTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.width.b32 \t$d, [$a];", + []>; +def SUQ_HEIGHT + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.height.b32 \t$d, [$a];", + []>; +def SUQ_DEPTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.depth.b32 \t$d, [$a];", + []>; +def SUQ_ARRAY_SIZE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.array_size.b32 \t$d, [$a];", + []>; +} + +def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a), + (SUQ_CHANNEL_ORDER Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a), + (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_width Int64Regs:$a), + (SUQ_WIDTH Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_height Int64Regs:$a), + (SUQ_HEIGHT Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_depth Int64Regs:$a), + (SUQ_DEPTH Int64Regs:$a)>; 
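The def : Pat entries in these query sections are the only glue between the front end and the query opcodes: a call to one of the int_nvvm_txq_* / int_nvvm_suq_* intrinsics is rewritten during instruction selection into the matching txq.* / suq.* instruction defined above. A minimal IR sketch of the kind of input that exercises these patterns — assuming the usual int_nvvm_* to llvm.nvvm.* intrinsic naming and a purely illustrative function tex_area taking an opaque i64 texture handle:

; assumed signatures: i32 result, i64 texture handle operand (matches Int32Regs/Int64Regs above)
declare i32 @llvm.nvvm.txq.width(i64)
declare i32 @llvm.nvvm.txq.height(i64)

define i32 @tex_area(i64 %tex) {
  ; each call matches one Pat<> above and should select to txq.width.b32 / txq.height.b32
  %w = call i32 @llvm.nvvm.txq.width(i64 %tex)
  %h = call i32 @llvm.nvvm.txq.height(i64 %tex)
  %area = mul i32 %w, %h
  ret i32 %area
}

The surface queries (int_nvvm_suq_*) follow the same shape, differing only in the opcode selected.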
+def : Pat<(int_nvvm_suq_array_size Int64Regs:$a), + (SUQ_ARRAY_SIZE Int64Regs:$a)>; + + +//===- Handle Query -------------------------------------------------------===// + +// TODO: These intrinsics are not yet finalized, pending PTX ISA design work +def ISTYPEP_SAMPLER + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.samplerref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>; +def ISTYPEP_SURFACE + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.surfref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>; +def ISTYPEP_TEXTURE + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.texref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>; + +//===- Surface Stores -----------------------------------------------------===// + +let IsSust = 1 in { +// Unformatted +// .clamp variant +def SUST_B_1D_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_1D_ARRAY_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, 
Int32Regs:$x, Int64Regs:$r), + "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r, + Int64Regs:$g), + "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + 
Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_ARRAY_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r), + "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g), + "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_3D_B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, 
$z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_V2B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B64_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V4B8_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B16_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B32_CLAMP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +// .trap variant +def SUST_B_1D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, 
Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_1D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r), + "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r, + Int64Regs:$g), + "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], 
\\{$r, $g\\};", + []>; +def SUST_B_2D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r), + "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g), + "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, 
Int16Regs:$a), + "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_3D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B64_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +// .zero variant +def SUST_B_1D_B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_ZERO + : NVPTXInst<(outs), + (ins 
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_1D_ARRAY_B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r), + "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_V2B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r, + Int64Regs:$g), + "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V4B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + 
"sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_V2B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V4B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_ARRAY_B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r), + "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_V2B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, 
$y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g), + "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V4B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_3D_B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_V2B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B64_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V4B8_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B16_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def 
SUST_B_3D_V4B32_ZERO + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + + +// Formatted + +def SUST_P_1D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_P_1D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_P_1D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_P_1D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_1D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_P_1D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_P_1D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_P_1D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, 
$x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_2D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_P_2D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_P_2D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_P_2D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_2D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_2D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_2D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_2D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, 
Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_3D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_P_3D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_P_3D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_P_3D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_3D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_3D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_3D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_3D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_3D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +} + +// Surface store instruction patterns +// I'm not sure why we can't just include these in the instruction definitions, +// but TableGen complains of type errors :( + +// .clamp variant +def : Pat<(int_nvvm_sust_b_1d_i8_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i16_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i64_clamp + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_B64_CLAMP Int64Regs:$s, 
Int32Regs:$x, Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : 
Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), + (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, 
Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_3d_i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B8_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B16_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_B_3D_B32_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + (SUST_B_3D_B64_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B8_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, 
Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B16_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_B_3D_V2B32_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + (SUST_B_3D_V2B64_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B8_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B16_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_3D_V4B32_CLAMP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + +// .trap variant +def : Pat<(int_nvvm_sust_b_1d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i64_trap + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i64_trap + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, 
Int32Regs:$a), + (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_1d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B8_TRAP Int64Regs:$s, 
Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), + (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, 
Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_3d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_B_3D_B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + (SUST_B_3D_B64_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_B_3D_V2B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i64_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + (SUST_B_3D_V2B64_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_3D_V4B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + 
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + +// .zero variant +def : Pat<(int_nvvm_sust_b_1d_i8_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i16_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i64_zero + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i8_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i16_zero + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i64_zero + Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i8_zero + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i16_zero + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i32_zero + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_1d_array_i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), + (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_ARRAY_V2B32_ZERO 
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), + (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), + (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, 
Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_array_i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), + (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, + Int64Regs:$g), + (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_3d_i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B8_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B16_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_B_3D_B32_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : 
Pat<(int_nvvm_sust_b_3d_i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r), + (SUST_B_3D_B64_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B8_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B16_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_B_3D_V2B32_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i64_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g), + (SUST_B_3D_V2B64_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int64Regs:$r, Int64Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i8_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B8_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i16_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B16_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i32_zero + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_3D_V4B32_ZERO Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + + +def : Pat<(int_nvvm_sust_p_1d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, + 
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_1d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_2d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_P_2D_V2B32_TRAP 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_2d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_3d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_P_3D_B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_3d_i16_trap + 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_P_3D_B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_3d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_P_3D_B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_3d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_3D_V2B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_3d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_3D_V2B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_3d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_P_3D_V2B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_3d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_3D_V4B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_3d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_3D_V4B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_3d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_3D_V4B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + //===-- Old PTX Back-end Intrinsics ---------------------------------------===// diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 7c257b4c6a89..f0c3663551bc 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -16,12 +16,12 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/Support/InstIterator.h" using namespace llvm; @@ -104,7 +104,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { SmallVector<MemTransferInst *, 4> aggrMemcpys; SmallVector<MemSetInst *, 4> aggrMemsets; - DataLayout *TD = &getAnalysis<DataLayout>(); + const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout(); LLVMContext &Context = F.getParent()->getContext(); // @@ -120,10 +120,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { if (load->hasOneUse() == false) continue; - if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize) + if (DL->getTypeStoreSize(load->getType()) < MaxAggrCopySize) continue; - User *use = *(load->use_begin()); + User *use = load->user_back(); if (StoreInst *store = dyn_cast<StoreInst>(use)) { if (store->getOperand(0) != load) //getValueOperand continue; 
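The NVPTXLowerAggrCopies hunk above only renames the DataLayout access (TD to DL, obtained through DataLayoutPass) and replaces use_begin() with user_back(); the candidate test itself is unchanged: a load qualifies for lowering when it has exactly one user, that user is a store of the loaded value, and the stored type is at least MaxAggrCopySize bytes. A minimal standalone sketch of that predicate, reusing only the calls visible in the hunk (hasOneUse, user_back, getTypeStoreSize) and not taken from the actual pass source:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Sketch of the candidate test performed in NVPTXLowerAggrCopies::runOnFunction
// above: large single-use loads that feed a store get lowered into copy loops.
static bool isAggrCopyCandidate(LoadInst *Load, const DataLayout *DL,
                                unsigned MaxAggrCopySize) {
  if (!Load->hasOneUse())
    return false;                          // the load must feed exactly one user
  if (DL->getTypeStoreSize(Load->getType()) < MaxAggrCopySize)
    return false;                          // small copies are left alone
  if (StoreInst *Store = dyn_cast<StoreInst>(Load->user_back()))
    return Store->getOperand(0) == Load;   // the user stores the loaded value
  return false;
}

The real pass applies further checks (and the later hunk performs the actual conversion to a loop via convertTransferToLoop); this sketch covers only the detection step shown here.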
@@ -163,10 +163,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) { LoadInst *load = aggrLoads[i]; - StoreInst *store = dyn_cast<StoreInst>(*load->use_begin()); + StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); Value *srcAddr = load->getOperand(0); Value *dstAddr = store->getOperand(1); - unsigned numLoads = TD->getTypeStoreSize(load->getType()); + unsigned numLoads = DL->getTypeStoreSize(load->getType()); Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads); convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(), diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h index 286e753fa92b..5ec1fc969687 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h @@ -27,16 +27,17 @@ struct NVPTXLowerAggrCopies : public FunctionPass { NVPTXLowerAggrCopies() : FunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DataLayout>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DataLayoutPass>(); + AU.addPreserved("stack-protector"); AU.addPreserved<MachineFunctionAnalysis>(); } - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; static const unsigned MaxAggrCopySize = 128; - virtual const char *getPassName() const { + const char *getPassName() const override { return "Lower aggregate copies/intrinsics into loops"; } }; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp index ca247642048a..137248b19a68 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp @@ -7,13 +7,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "nvptx-mcexpr" #include "NVPTXMCExpr.h" #include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" using namespace llvm; +#define DEBUG_TYPE "nvptx-mcexpr" + const NVPTXFloatMCExpr* NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) { return new (Ctx) NVPTXFloatMCExpr(Kind, Flt); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h index 0efb231d7d2e..554764930a9e 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h @@ -61,18 +61,18 @@ public: /// @} - void PrintImpl(raw_ostream &OS) const; + void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { + const MCAsmLayout *Layout) const override { return false; } - void AddValueSymbols(MCAssembler *) const {}; - const MCSection *FindAssociatedSection() const { - return NULL; + void visitUsedExpr(MCStreamer &Streamer) const override {}; + const MCSection *FindAssociatedSection() const override { + return nullptr; } // There are no TLS NVPTXMCExprs at the moment. 
- void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {} + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h new file mode 100644 index 000000000000..67fb39050797 --- /dev/null +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h @@ -0,0 +1,46 @@ +//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class is attached to a MachineFunction instance and tracks target- +// dependent information +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { +class NVPTXMachineFunctionInfo : public MachineFunctionInfo { +private: + /// Stores a mapping from index to symbol name for removing image handles + /// on Fermi. + SmallVector<std::string, 8> ImageHandleList; + +public: + NVPTXMachineFunctionInfo(MachineFunction &MF) {} + + /// Returns the index for the symbol \p Symbol. If the symbol was previously, + /// added, the same index is returned. Otherwise, the symbol is added and the + /// new index is returned. + unsigned getImageHandleSymbolIndex(const char *Symbol) { + // Is the symbol already present? + for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i) + if (ImageHandleList[i] == std::string(Symbol)) + return i; + // Nope, insert it + ImageHandleList.push_back(Symbol); + return ImageHandleList.size()-1; + } + + /// Returns the symbol name at the given index. 
+ const char *getImageHandleSymbol(unsigned Idx) const { + assert(ImageHandleList.size() > Idx && "Bad index"); + return ImageHandleList[Idx].c_str(); + } +}; +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 843ebed5e4a4..348ab0c4bf14 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -14,24 +14,26 @@ //===----------------------------------------------------------------------===// #include "NVPTX.h" -#include "llvm/Pass.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "nvptx-prolog-epilog" + namespace { class NVPTXPrologEpilogPass : public MachineFunctionPass { public: static char ID; NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {} - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; private: void calculateFrameObjectOffsets(MachineFunction &Fn); @@ -58,7 +60,7 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (!MI->getOperand(i).isFI()) continue; - TRI.eliminateFrameIndex(MI, 0, i, NULL); + TRI.eliminateFrameIndex(MI, 0, i, nullptr); Modified = true; } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 4d3a1d9b40ce..358ccce39818 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "nvptx-reg-info" - #include "NVPTXRegisterInfo.h" #include "NVPTX.h" #include "NVPTXSubtarget.h" @@ -25,6 +23,8 @@ using namespace llvm; +#define DEBUG_TYPE "nvptx-reg-info" + namespace llvm { std::string getNVPTXRegClassName(TargetRegisterClass const *RC) { if (RC == &NVPTX::Float32RegsRegClass) { @@ -53,9 +53,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%f"; } if (RC == &NVPTX::Float64RegsRegClass) { - return "%fl"; + return "%fd"; } else if (RC == &NVPTX::Int64RegsRegClass) { - return "%rl"; + return "%rd"; } else if (RC == &NVPTX::Int32RegsRegClass) { return "%r"; } else if (RC == &NVPTX::Int16RegsRegClass) { @@ -78,19 +78,12 @@ NVPTXRegisterInfo::NVPTXRegisterInfo(const NVPTXSubtarget &st) #include "NVPTXGenRegisterInfo.inc" /// NVPTX Callee Saved Registers -const uint16_t * +const MCPhysReg * NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - static const uint16_t CalleeSavedRegs[] = { 0 }; + static const MCPhysReg CalleeSavedRegs[] = { 0 }; return CalleeSavedRegs; } -// NVPTX Callee Saved Reg Classes -const TargetRegisterClass *const * -NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass *const CalleeSavedRegClasses[] = { 0 }; - return CalleeSavedRegClasses; -} - BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); return Reserved; @@ -113,12 
+106,6 @@ void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); } -int NVPTXRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { - return 0; -} - unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return NVPTX::VRFrame; } - -unsigned NVPTXRegisterInfo::getRARegister() const { return 0; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h index 0a20f29ccb1e..a7594be121a0 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -16,11 +16,10 @@ #include "ManagedStringPool.h" #include "llvm/Target/TargetRegisterInfo.h" +#include <sstream> #define GET_REGINFO_HEADER #include "NVPTXGenRegisterInfo.inc" -#include "llvm/Target/TargetRegisterInfo.h" -#include <sstream> namespace llvm { @@ -42,22 +41,16 @@ public: //------------------------------------------------------ // NVPTX callee saved registers - virtual const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const; - - // NVPTX callee saved register classes - virtual const TargetRegisterClass *const * - getCalleeSavedRegClasses(const MachineFunction *MF) const; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; - virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; - virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const; - virtual unsigned getFrameRegister(const MachineFunction &MF) const; - virtual unsigned getRARegister() const; + unsigned getFrameRegister(const MachineFunction &MF) const override; ManagedStringPool *getStrPool() const { return const_cast<ManagedStringPool *>(&ManagedStrPool); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index 7a38a66b9227..efcee6b6f2bd 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -35,9 +35,9 @@ foreach i = 0-4 in { def P#i : NVPTXReg<"%p"#i>; // Predicate def RS#i : NVPTXReg<"%rs"#i>; // 16-bit def R#i : NVPTXReg<"%r"#i>; // 32-bit - def RL#i : NVPTXReg<"%rl"#i>; // 64-bit + def RL#i : NVPTXReg<"%rd"#i>; // 64-bit def F#i : NVPTXReg<"%f"#i>; // 32-bit float - def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float + def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float // Arguments def ia#i : NVPTXReg<"%ia"#i>; @@ -46,6 +46,10 @@ foreach i = 0-4 in { def da#i : NVPTXReg<"%da"#i>; } +foreach i = 0-31 in { + def ENVREG#i : NVPTXReg<"%envreg"#i>; +} + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// @@ -61,4 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>; def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>; // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. 
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; +def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot, + (sequence "ENVREG%u", 0, 31))>; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp new file mode 100644 index 000000000000..20d4e272341e --- /dev/null +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -0,0 +1,189 @@ +//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// On Fermi, image handles are not supported. To work around this, we traverse +// the machine code and replace image handles with concrete symbols. For this +// to work reliably, inlining of all function call must be performed. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "NVPTXMachineFunctionInfo.h" +#include "NVPTXSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DenseSet.h" + +using namespace llvm; + +namespace { +class NVPTXReplaceImageHandles : public MachineFunctionPass { +private: + static char ID; + DenseSet<MachineInstr *> InstrsToRemove; + +public: + NVPTXReplaceImageHandles(); + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "NVPTX Replace Image Handles"; + } +private: + bool processInstr(MachineInstr &MI); + void replaceImageHandle(MachineOperand &Op, MachineFunction &MF); + bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF, + unsigned &Idx); +}; +} + +char NVPTXReplaceImageHandles::ID = 0; + +NVPTXReplaceImageHandles::NVPTXReplaceImageHandles() + : MachineFunctionPass(ID) {} + +bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + InstrsToRemove.clear(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end(); + I != E; ++I) { + MachineInstr &MI = *I; + Changed |= processInstr(MI); + } + } + + // Now clean up any handle-access instructions + // This is needed in debug mode when code cleanup passes are not executed, + // but we need the handle access to be eliminated because they are not + // valid instructions when image handles are disabled. 
+ for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(), + E = InstrsToRemove.end(); I != E; ++I) { + (*I)->eraseFromParent(); + } + return Changed; +} + +bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) { + MachineFunction &MF = *MI.getParent()->getParent(); + const MCInstrDesc &MCID = MI.getDesc(); + + if (MCID.TSFlags & NVPTXII::IsTexFlag) { + // This is a texture fetch, so operand 4 is a texref and operand 5 is + // a samplerref + MachineOperand &TexHandle = MI.getOperand(4); + replaceImageHandle(TexHandle, MF); + + if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) { + MachineOperand &SampHandle = MI.getOperand(5); + replaceImageHandle(SampHandle, MF); + } + + return true; + } else if (MCID.TSFlags & NVPTXII::IsSuldMask) { + unsigned VecSize = + 1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1); + + // For a surface load of vector size N, the Nth operand will be the surfref + MachineOperand &SurfHandle = MI.getOperand(VecSize); + + replaceImageHandle(SurfHandle, MF); + + return true; + } else if (MCID.TSFlags & NVPTXII::IsSustFlag) { + // This is a surface store, so operand 0 is a surfref + MachineOperand &SurfHandle = MI.getOperand(0); + + replaceImageHandle(SurfHandle, MF); + + return true; + } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) { + // This is a query, so operand 1 is a surfref/texref + MachineOperand &Handle = MI.getOperand(1); + + replaceImageHandle(Handle, MF); + + return true; + } + + return false; +} + +void NVPTXReplaceImageHandles:: +replaceImageHandle(MachineOperand &Op, MachineFunction &MF) { + unsigned Idx; + if (findIndexForHandle(Op, MF, Idx)) { + Op.ChangeToImmediate(Idx); + } +} + +bool NVPTXReplaceImageHandles:: +findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>(); + + assert(Op.isReg() && "Handle is not in a reg?"); + + // Which instruction defines the handle? 
+ MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg()); + + switch (TexHandleDef.getOpcode()) { + case NVPTX::LD_i64_avar: { + // The handle is a parameter value being loaded, replace with the + // parameter symbol + const NVPTXSubtarget &ST = MF.getTarget().getSubtarget<NVPTXSubtarget>(); + if (ST.getDrvInterface() == NVPTX::CUDA) { + // For CUDA, we preserve the param loads coming from function arguments + return false; + } + + assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!"); + StringRef Sym = TexHandleDef.getOperand(6).getSymbolName(); + std::string ParamBaseName = MF.getName(); + ParamBaseName += "_param_"; + assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference"); + unsigned Param = atoi(Sym.data()+ParamBaseName.size()); + std::string NewSym; + raw_string_ostream NewSymStr(NewSym); + NewSymStr << MF.getFunction()->getName() << "_param_" << Param; + + InstrsToRemove.insert(&TexHandleDef); + Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str()); + return true; + } + case NVPTX::texsurf_handles: { + // The handle is a global variable, replace with the global variable name + assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!"); + const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal(); + assert(GV->hasName() && "Global sampler must be named!"); + InstrsToRemove.insert(&TexHandleDef); + Idx = MFI->getImageHandleSymbolIndex(GV->getName().data()); + return true; + } + case NVPTX::nvvm_move_i64: + case TargetOpcode::COPY: { + bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx); + if (Res) { + InstrsToRemove.insert(&TexHandleDef); + } + return Res; + } + default: + llvm_unreachable("Unknown instruction operating on handle"); + } +} + +MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() { + return new NVPTXReplaceImageHandles(); +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h index f8a692e9b1a3..aa0436bf0da7 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h @@ -31,16 +31,16 @@ public: /// Override this as NVPTX has its own way of printing switching /// to a section. - virtual void PrintSwitchToSection(const MCAsmInfo &MAI, - raw_ostream &OS, - const MCExpr *Subsection) const {} + void PrintSwitchToSection(const MCAsmInfo &MAI, + raw_ostream &OS, + const MCExpr *Subsection) const override {} /// Base address of PTX sections is zero. 
- virtual bool isBaseAddressKnownZero() const { return true; } - virtual bool UseCodeAlign() const { return false; } - virtual bool isVirtualSection() const { return false; } - virtual std::string getLabelBeginName() const { return ""; } - virtual std::string getLabelEndName() const { return ""; } + bool isBaseAddressKnownZero() const override { return true; } + bool UseCodeAlign() const override { return false; } + bool isVirtualSection() const override { return false; } + std::string getLabelBeginName() const override { return ""; } + std::string getLabelEndName() const override { return ""; } }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp deleted file mode 100644 index b64c30880b94..000000000000 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp +++ /dev/null @@ -1,73 +0,0 @@ -//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier --*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Split basic blocks so that a basic block that contains a barrier instruction -// only contains the barrier instruction. -// -//===----------------------------------------------------------------------===// - -#include "NVPTXSplitBBatBar.h" -#include "NVPTXUtilities.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/InstIterator.h" - -using namespace llvm; - -namespace llvm { FunctionPass *createSplitBBatBarPass(); } - -char NVPTXSplitBBatBar::ID = 0; - -bool NVPTXSplitBBatBar::runOnFunction(Function &F) { - - SmallVector<Instruction *, 4> SplitPoints; - bool changed = false; - - // Collect all the split points in SplitPoints - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { - BasicBlock::iterator IB = BI->begin(); - BasicBlock::iterator II = IB; - BasicBlock::iterator IE = BI->end(); - - // Skit the first instruction. No splitting is needed at this - // point even if this is a bar. - while (II != IE) { - if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) { - Intrinsic::ID id = inst->getIntrinsicID(); - // If this is a barrier, split at this instruction - // and the next instruction. - if (llvm::isBarrierIntrinsic(id)) { - if (II != IB) - SplitPoints.push_back(II); - II++; - if ((II != IE) && (!II->isTerminator())) { - SplitPoints.push_back(II); - II++; - } - continue; - } - } - II++; - } - } - - for (unsigned i = 0; i != SplitPoints.size(); i++) { - changed = true; - Instruction *inst = SplitPoints[i]; - inst->getParent()->splitBasicBlock(inst, "bar_split"); - } - - return changed; -} - -// This interface will most likely not be necessary, because this pass will -// not be invoked by the driver, but will be used as a prerequisite to -// another pass. 
-FunctionPass *llvm::createSplitBBatBarPass() { return new NVPTXSplitBBatBar(); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h deleted file mode 100644 index bdafba9075a0..000000000000 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h +++ /dev/null @@ -1,41 +0,0 @@ -//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the NVIDIA specific declarations -// for splitting basic blocks at barrier instructions. -// -//===----------------------------------------------------------------------===// - -#ifndef NVPTX_SPLIT_BB_AT_BAR_H -#define NVPTX_SPLIT_BB_AT_BAR_H - -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/Pass.h" - -namespace llvm { - -// actual analysis class, which is a functionpass -struct NVPTXSplitBBatBar : public FunctionPass { - static char ID; - - NVPTXSplitBBatBar() : FunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved<MachineFunctionAnalysis>(); - } - virtual bool runOnFunction(Function &F); - - virtual const char *getPassName() const { - return "Split basic blocks at barrier"; - } -}; - -extern FunctionPass *createSplitBBatBarPass(); -} - -#endif //NVPTX_SPLIT_BB_AT_BAR_H diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 9771a176d8dc..d5cded218362 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -12,21 +12,54 @@ //===----------------------------------------------------------------------===// #include "NVPTXSubtarget.h" + +using namespace llvm; + +#define DEBUG_TYPE "nvptx-subtarget" + #define GET_SUBTARGETINFO_ENUM #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "NVPTXGenSubtargetInfo.inc" -using namespace llvm; - - // Pin the vtable to this file. void NVPTXSubtarget::anchor() {} +static std::string computeDataLayout(bool is64Bit) { + std::string Ret = "e"; + + if (!is64Bit) + Ret += "-p:32:32"; + + Ret += "-i64:64-v16:16-v32:32-n16:32:64"; + + return Ret; +} + +NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + // Provide the default CPU if we don't have one. + if (CPU.empty() && FS.size()) + llvm_unreachable("we are not using FeatureStr"); + TargetName = CPU.empty() ? 
"sm_20" : CPU; + + ParseSubtargetFeatures(TargetName, FS); + + // Set default to PTX 3.2 (CUDA 5.5) + if (PTXVersion == 0) { + PTXVersion = 32; + } + + return *this; +} + NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) + const std::string &FS, const TargetMachine &TM, + bool is64Bit) : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0), - SmVersion(20) { + SmVersion(20), DL(computeDataLayout(is64Bit)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) { Triple T(TT); @@ -34,26 +67,4 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, drvInterface = NVPTX::NVCL; else drvInterface = NVPTX::CUDA; - - // Provide the default CPU if none - std::string defCPU = "sm_20"; - - ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS); - - // Get the TargetName from the FS if available - if (FS.empty() && CPU.empty()) - TargetName = defCPU; - else if (!CPU.empty()) - TargetName = CPU; - else - llvm_unreachable("we are not using FeatureStr"); - - // We default to PTX 3.1, but we cannot just default to it in the initializer - // since the attribute parser checks if the given option is >= the default. - // So if we set ptx31 as the default, the ptx30 attribute would never match. - // Instead, we use 0 as the default and manually set 31 if the default is - // used. - if (PTXVersion == 0) { - PTXVersion = 31; - } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 004be116a96c..4c41e4e470dd 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -15,13 +15,18 @@ #define NVPTXSUBTARGET_H #include "NVPTX.h" +#include "NVPTXFrameLowering.h" +#include "NVPTXISelLowering.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <string> #define GET_SUBTARGETINFO_HEADER #include "NVPTXGenSubtargetInfo.inc" -#include <string> - namespace llvm { class NVPTXSubtarget : public NVPTXGenSubtargetInfo { @@ -36,12 +41,30 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31 unsigned int SmVersion; + const DataLayout DL; // Calculates type size & alignment + NVPTXInstrInfo InstrInfo; + NVPTXTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + + // NVPTX does not have any call stack frame, but need a NVPTX specific + // FrameLowering class because TargetFrameLowering is abstract. + NVPTXFrameLowering FrameLowering; + public: /// This constructor initializes the data members to match that /// of the specified module. 
/// NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit); + const std::string &FS, const TargetMachine &TM, bool is64Bit); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const NVPTXRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } bool hasBrkPt() const { return SmVersion >= 11; } bool hasAtomRedG32() const { return SmVersion >= 11; } @@ -58,13 +81,24 @@ public: bool hasFMAF32() const { return SmVersion >= 20; } bool hasFMAF64() const { return SmVersion >= 13; } bool hasLDG() const { return SmVersion >= 32; } - bool hasLDU() const { return SmVersion >= 20; } + bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); } bool hasGenericLdSt() const { return SmVersion >= 20; } - inline bool hasHWROT32() const { return false; } - inline bool hasSWROT32() const { return true; } + inline bool hasHWROT32() const { return SmVersion >= 32; } + inline bool hasSWROT32() const { + return ((SmVersion >= 20) && (SmVersion < 32)); + } inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); } inline bool hasROT64() const { return SmVersion >= 20; } + bool hasImageHandles() const { + // Enable handles for Kepler+, where CUDA supports indirect surfaces and + // textures + if (getDrvInterface() == NVPTX::CUDA) + return (SmVersion >= 30); + + // Disabled, otherwise + return false; + } bool is64Bit() const { return Is64Bit; } unsigned int getSmVersion() const { return SmVersion; } @@ -73,22 +107,8 @@ public: unsigned getPTXVersion() const { return PTXVersion; } + NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - std::string getDataLayout() const { - const char *p; - if (is64Bit()) - p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" - "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-" - "n16:32:64"; - else - p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" - "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-" - "n16:32:64"; - - return std::string(p); - } - }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 46edd6d83f65..069a1b9966f0 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -16,16 +16,14 @@ #include "NVPTX.h" #include "NVPTXAllocaHoisting.h" #include "NVPTXLowerAggrCopies.h" -#include "NVPTXSplitBBatBar.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Assembly/PrintModulePass.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCStreamer.h" @@ -50,6 +48,8 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); +void 
initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); +void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); } extern "C" void LLVMInitializeNVPTXTarget() { @@ -61,17 +61,18 @@ extern "C" void LLVMInitializeNVPTXTarget() { // but it's very NVPTX-specific. initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); + initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry()); + initializeNVPTXFavorNonGenericAddrSpacesPass( + *PassRegistry::getPassRegistry()); } -NVPTXTargetMachine::NVPTXTargetMachine( - const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool is64bit) +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering( - *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } @@ -101,14 +102,15 @@ public: return getTM<NVPTXTargetMachine>(); } - virtual void addIRPasses(); - virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPostRegAlloc(); + void addIRPasses() override; + bool addInstSelector() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + void addMachineSSAOptimization() override; - virtual FunctionPass *createTargetRegisterAllocator(bool) LLVM_OVERRIDE; - virtual void addFastRegAlloc(FunctionPass *RegAllocPass); - virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass); + FunctionPass *createTargetRegisterAllocator(bool) override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; }; } // end anonymous namespace @@ -128,15 +130,42 @@ void NVPTXPassConfig::addIRPasses() { disablePass(&BranchFolderPassID); disablePass(&TailDuplicateID); + addPass(createNVPTXImageOptimizerPass()); TargetPassConfig::addIRPasses(); + addPass(createNVPTXAssignValidGlobalNamesPass()); addPass(createGenericToNVVMPass()); + addPass(createNVPTXFavorNonGenericAddrSpacesPass()); + addPass(createSeparateConstOffsetFromGEPPass()); + // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used + // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates + // significantly better code than EarlyCSE for some of our benchmarks. + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createGVNPass()); + else + addPass(createEarlyCSEPass()); + // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave + // some dead code. We could remove dead code in an ad-hoc manner, but that + // requires manual work and might be error-prone. + // + // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts, + // and leave them unused. + // + // SeparateConstOffsetFromGEP rebuilds a new index from the old index, and the + // old index and some of its intermediate results may become unused. 
+ addPass(createDeadCodeEliminationPass()); } bool NVPTXPassConfig::addInstSelector() { + const NVPTXSubtarget &ST = + getTM<NVPTXTargetMachine>().getSubtarget<NVPTXSubtarget>(); + addPass(createLowerAggrCopies()); - addPass(createSplitBBatBarPass()); addPass(createAllocaHoisting()); addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); + + if (!ST.hasImageHandles()) + addPass(createNVPTXReplaceImageHandlesPass()); + return false; } @@ -147,7 +176,7 @@ bool NVPTXPassConfig::addPostRegAlloc() { } FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { - return 0; // No reg alloc + return nullptr; // No reg alloc } void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { @@ -179,3 +208,43 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { printAndVerify("After StackSlotColoring"); } + +void NVPTXPassConfig::addMachineSSAOptimization() { + // Pre-ra tail duplication. + if (addPass(&EarlyTailDuplicateID)) + printAndVerify("After Pre-RegAlloc TailDuplicate"); + + // Optimize PHIs before DCE: removing dead PHI cycles may make more + // instructions dead. + addPass(&OptimizePHIsID); + + // This pass merges large allocas. StackSlotColoring is a different pass + // which merges spill slots. + addPass(&StackColoringID); + + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + addPass(&LocalStackSlotAllocationID); + + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + addPass(&DeadMachineInstructionElimID); + printAndVerify("After codegen DCE pass"); + + // Allow targets to insert passes that improve instruction level parallelism, + // like if-conversion. Such passes will typically need dominator trees and + // loop info, just like LICM and CSE below. + if (addILPOpts()) + printAndVerify("After ILP optimizations"); + + addPass(&MachineLICMID); + addPass(&MachineCSEID); + + addPass(&MachineSinkingID); + printAndVerify("After Machine LICM, CSE and Sinking passes"); + + addPass(&PeepholeOptimizerID); + printAndVerify("After codegen peephole optimization pass"); +} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index 5fbcf735b48f..a7a1c8f4e171 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -14,13 +14,8 @@ #ifndef NVPTX_TARGETMACHINE_H #define NVPTX_TARGETMACHINE_H -#include "ManagedStringPool.h" -#include "NVPTXFrameLowering.h" -#include "NVPTXISelLowering.h" -#include "NVPTXInstrInfo.h" -#include "NVPTXRegisterInfo.h" #include "NVPTXSubtarget.h" -#include "llvm/IR/DataLayout.h" +#include "ManagedStringPool.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSelectionDAGInfo.h" @@ -31,65 +26,52 @@ namespace llvm { /// class NVPTXTargetMachine : public LLVMTargetMachine { NVPTXSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - NVPTXInstrInfo InstrInfo; - NVPTXTargetLowering TLInfo; - TargetSelectionDAGInfo TSInfo; - - // NVPTX does not have any call stack frame, but need a NVPTX specific - // FrameLowering class because TargetFrameLowering is abstract. 
- NVPTXFrameLowering FrameLowering; // Hold Strings that can be free'd all together with NVPTXTargetMachine ManagedStringPool ManagedStrPool; - //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, - // bool DisableVerify, MCContext *&OutCtx); - public: NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit); - virtual const TargetFrameLowering *getFrameLowering() const { - return &FrameLowering; + const TargetFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); } - virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const DataLayout *getDataLayout() const { return &DL; } - virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; } - - virtual const NVPTXRegisterInfo *getRegisterInfo() const { - return &(InstrInfo.getRegisterInfo()); + const NVPTXInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); } - - virtual NVPTXTargetLowering *getTargetLowering() const { - return const_cast<NVPTXTargetLowering *>(&TLInfo); + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - - virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const { - return &TSInfo; + const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const NVPTXRegisterInfo *getRegisterInfo() const override { + return getSubtargetImpl()->getRegisterInfo(); } - //virtual bool addInstSelector(PassManagerBase &PM, - // CodeGenOpt::Level OptLevel); + const NVPTXTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); + } - //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level); + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); + } ManagedStringPool *getManagedStrPool() const { return const_cast<ManagedStringPool *>(&ManagedStrPool); } - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; // Emission of machine code through JITCodeEmitter is not supported. - virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &, - bool = true) { + bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &, + bool = true) override { return true; } // Emission of machine code through MCJIT is not supported. 
- virtual bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &, - bool = true) { + bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &, + bool = true) override { return true; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 2a7394b79ae6..ba8086d78880 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -22,31 +22,31 @@ class NVPTXTargetObjectFile : public TargetLoweringObjectFile { public: NVPTXTargetObjectFile() { - TextSection = 0; - DataSection = 0; - BSSSection = 0; - ReadOnlySection = 0; + TextSection = nullptr; + DataSection = nullptr; + BSSSection = nullptr; + ReadOnlySection = nullptr; - StaticCtorSection = 0; - StaticDtorSection = 0; - LSDASection = 0; - EHFrameSection = 0; - DwarfAbbrevSection = 0; - DwarfInfoSection = 0; - DwarfLineSection = 0; - DwarfFrameSection = 0; - DwarfPubTypesSection = 0; - DwarfDebugInlineSection = 0; - DwarfStrSection = 0; - DwarfLocSection = 0; - DwarfARangesSection = 0; - DwarfRangesSection = 0; - DwarfMacroInfoSection = 0; + StaticCtorSection = nullptr; + StaticDtorSection = nullptr; + LSDASection = nullptr; + EHFrameSection = nullptr; + DwarfAbbrevSection = nullptr; + DwarfInfoSection = nullptr; + DwarfLineSection = nullptr; + DwarfFrameSection = nullptr; + DwarfPubTypesSection = nullptr; + DwarfDebugInlineSection = nullptr; + DwarfStrSection = nullptr; + DwarfLocSection = nullptr; + DwarfARangesSection = nullptr; + DwarfRangesSection = nullptr; + DwarfMacroInfoSection = nullptr; } virtual ~NVPTXTargetObjectFile(); - virtual void Initialize(MCContext &ctx, const TargetMachine &TM) { + void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText()); DataSection = @@ -87,13 +87,14 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } - virtual const MCSection *getSectionForConstant(SectionKind Kind) const { + const MCSection *getSectionForConstant(SectionKind Kind, + const Constant *C) const override { return ReadOnlySection; } - virtual const MCSection * - getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler *Mang, const TargetMachine &TM) const { + const MCSection *getExplicitSectionGlobal(const GlobalValue *GV, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const override { return DataSection; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 6786eb02240c..a9fd190b7ff0 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -22,9 +22,9 @@ #include <map> #include <string> #include <vector> -//#include <iostream> #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/InstIterator.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/MutexGuard.h" using namespace llvm; @@ -33,8 +33,15 @@ typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t; typedef std::map<const Module *, global_val_annot_t> per_module_annot_t; ManagedStatic<per_module_annot_t> annotationCache; +static sys::Mutex Lock; + +void llvm::clearAnnotationCache(const llvm::Module *Mod) { + MutexGuard Guard(Lock); + annotationCache->erase(Mod); +} static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { + MutexGuard Guard(Lock); 
assert(md && "Invalid mdnode for annotation"); assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands"); // start index = 1, to skip the global variable key @@ -60,6 +67,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { } static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { + MutexGuard Guard(Lock); NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations); if (!NMD) return; @@ -92,6 +100,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop, unsigned &retval) { + MutexGuard Guard(Lock); const Module *m = gv->getParent(); if ((*annotationCache).find(m) == (*annotationCache).end()) cacheAnnotationFromMD(m, gv); @@ -105,6 +114,7 @@ bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop, bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop, std::vector<unsigned> &retval) { + MutexGuard Guard(Lock); const Module *m = gv->getParent(); if ((*annotationCache).find(m) == (*annotationCache).end()) cacheAnnotationFromMD(m, gv); @@ -195,8 +205,37 @@ bool llvm::isImageWriteOnly(const llvm::Value &val) { return false; } +bool llvm::isImageReadWrite(const llvm::Value &val) { + if (const Argument *arg = dyn_cast<Argument>(&val)) { + const Function *func = arg->getParent(); + std::vector<unsigned> annot; + if (llvm::findAllNVVMAnnotation(func, + llvm::PropertyAnnotationNames[ + llvm::PROPERTY_ISREADWRITE_IMAGE_PARAM], + annot)) { + if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end()) + return true; + } + } + return false; +} + bool llvm::isImage(const llvm::Value &val) { - return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val); + return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val) || + llvm::isImageReadWrite(val); +} + +bool llvm::isManaged(const llvm::Value &val) { + if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) { + unsigned annot; + if(llvm::findOneNVVMAnnotation(gv, + llvm::PropertyAnnotationNames[llvm::PROPERTY_MANAGED], + annot)) { + assert((annot == 1) && "Unexpected annotation on a managed symbol"); + return true; + } + } + return false; } std::string llvm::getTextureName(const llvm::Value &val) { @@ -354,12 +393,12 @@ llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) { const Value * llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) { if (processed.find(V) != processed.end()) - return NULL; + return nullptr; processed.insert(V); const Value *V2 = V->stripPointerCasts(); if (V2 != V && processed.find(V2) != processed.end()) - return NULL; + return nullptr; processed.insert(V2); V = V2; @@ -375,20 +414,20 @@ llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) { continue; } else if (const PHINode *PN = dyn_cast<PHINode>(V)) { if (V != V2 && processed.find(V) != processed.end()) - return NULL; + return nullptr; processed.insert(PN); - const Value *common = 0; + const Value *common = nullptr; for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { const Value *pv = PN->getIncomingValue(i); const Value *base = skipPointerTransfer(pv, processed); if (base) { - if (common == 0) + if (!common) common = base; else if (common != base) return PN; } } - if (common == 0) + if (!common) return PN; V = common; } @@ -406,7 +445,7 @@ BasicBlock *llvm::getParentBlock(Value *v) { if (Instruction *I = dyn_cast<Instruction>(v)) return I->getParent(); - return 0; + return 
nullptr; } Function *llvm::getParentFunction(Value *v) { @@ -419,13 +458,13 @@ Function *llvm::getParentFunction(Value *v) { if (BasicBlock *B = dyn_cast<BasicBlock>(v)) return B->getParent(); - return 0; + return nullptr; } // Dump a block by name void llvm::dumpBlock(Value *v, char *blockName) { Function *F = getParentFunction(v); - if (F == 0) + if (!F) return; for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) { @@ -440,8 +479,8 @@ void llvm::dumpBlock(Value *v, char *blockName) { // Find an instruction by name Instruction *llvm::getInst(Value *base, char *instName) { Function *F = getParentFunction(base); - if (F == 0) - return 0; + if (!F) + return nullptr; for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) { Instruction *I = &*it; @@ -450,7 +489,7 @@ Instruction *llvm::getInst(Value *base, char *instName) { } } - return 0; + return nullptr; } // Dump an instruction by nane diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h index a208004297d0..446bfa1e112c 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -28,6 +28,8 @@ namespace llvm { #define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly" #define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly" +void clearAnnotationCache(const llvm::Module *); + bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &); bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string, std::vector<unsigned> &); @@ -38,6 +40,8 @@ bool isSampler(const llvm::Value &); bool isImage(const llvm::Value &); bool isImageReadOnly(const llvm::Value &); bool isImageWriteOnly(const llvm::Value &); +bool isImageReadWrite(const llvm::Value &); +bool isManaged(const llvm::Value &); std::string getTextureName(const llvm::Value &); std::string getSurfaceName(const llvm::Value &); diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 7406207c94b5..a8d6b95ae4ec 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This pass replaces occurences of __nvvm_reflect("string") with an +// This pass replaces occurrences of __nvvm_reflect("string") with an // integer based on -nvvm-reflect-list string=<int> option given to this pass. // If an undefined string value is seen in a call to __nvvm_reflect("string"), // a default value of 0 will be used. 
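For context on the pass described in the header comment above: __nvvm_reflect is normally declared in device code (libdevice-style headers use strings such as __CUDA_FTZ) and NVVMReflect folds each call into the integer supplied via -nvvm-reflect-list, or into 0 when the string has no entry. A minimal sketch of the source-level pattern the pass operates on, assuming a CUDA-style translation unit; the function name scale and the string "FOO" are illustrative, not taken from the patch:

    // Declaration as libdevice-style headers typically expose it; NVVMReflect
    // later folds each call to this function into an integer constant.
    extern "C" int __nvvm_reflect(const char *);

    __device__ float scale(float x) {
      // With -nvvm-reflect-list FOO=3 this call becomes the constant 3 and the
      // branch is taken; with no entry for "FOO" it becomes 0 and the branch
      // folds away entirely.
      if (__nvvm_reflect("FOO"))
        return x * 2.0f;
      return x;
    }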
@@ -18,13 +18,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Pass.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Constants.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_os_ostream.h" @@ -38,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "nvptx-reflect" + namespace llvm { void initializeNVVMReflectPass(PassRegistry &); } namespace { @@ -45,17 +48,16 @@ class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; - Function *ReflectFunction; public: static char ID; - NVVMReflect() : ModulePass(ID), ReflectFunction(0) { + NVVMReflect() : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); } NVVMReflect(const StringMap<int> &Mapping) - : ModulePass(ID), ReflectFunction(0) { + : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); I != E; ++I) { @@ -63,9 +65,13 @@ public: } } - void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } - virtual bool runOnModule(Module &); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + bool runOnModule(Module &) override; +private: + bool handleFunction(Function *ReflectFunction); void setVarMap(); }; } @@ -84,7 +90,7 @@ NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden, char NVVMReflect::ID = 0; INITIALIZE_PASS(NVVMReflect, "nvvm-reflect", - "Replace occurences of __nvvm_reflect() calls with 0/1", false, + "Replace occurrences of __nvvm_reflect() calls with 0/1", false, false) static cl::list<std::string> @@ -116,19 +122,7 @@ void NVVMReflect::setVarMap() { } } -bool NVVMReflect::runOnModule(Module &M) { - if (!NVVMReflectEnabled) - return false; - - setVarMap(); - - ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); - - // If reflect function is not used, then there will be - // no entry in the module. - if (ReflectFunction == 0) - return false; - +bool NVVMReflect::handleFunction(Function *ReflectFunction) { // Validate _reflect function assert(ReflectFunction->isDeclaration() && "_reflect function should not have a body"); @@ -143,23 +137,23 @@ bool NVVMReflect::runOnModule(Module &M) { // ConstantArray can be found successfully, see if it can be // found in VarMap. If so, replace the uses of CallInst with the // value found in VarMap. If not, replace the use with value 0. - for (Value::use_iterator I = ReflectFunction->use_begin(), - E = ReflectFunction->use_end(); - I != E; ++I) { - assert(isa<CallInst>(*I) && "Only a call instruction can use _reflect"); - CallInst *Reflect = cast<CallInst>(*I); + for (User *U : ReflectFunction->users()) { + assert(isa<CallInst>(U) && "Only a call instruction can use _reflect"); + CallInst *Reflect = cast<CallInst>(U); assert((Reflect->getNumOperands() == 2) && "Only one operand expect for _reflect function"); // In cuda, we will have an extra constant-to-generic conversion of // the string. 
- const Value *conv = Reflect->getArgOperand(0); - assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion"); - const CallInst *ConvCall = cast<CallInst>(conv); - const Value *str = ConvCall->getArgOperand(0); - assert(isa<ConstantExpr>(str) && + const Value *Str = Reflect->getArgOperand(0); + if (isa<CallInst>(Str)) { + // CUDA path + const CallInst *ConvCall = cast<CallInst>(Str); + Str = ConvCall->getArgOperand(0); + } + assert(isa<ConstantExpr>(Str) && "Format of _reflect function not recognized"); - const ConstantExpr *GEP = cast<ConstantExpr>(str); + const ConstantExpr *GEP = cast<ConstantExpr>(Str); const Value *Sym = GEP->getOperand(0); assert(isa<Constant>(Sym) && "Format of _reflect function not recognized"); @@ -193,3 +187,36 @@ bool NVVMReflect::runOnModule(Module &M) { ToRemove[i]->eraseFromParent(); return true; } + +bool NVVMReflect::runOnModule(Module &M) { + if (!NVVMReflectEnabled) + return false; + + setVarMap(); + + + bool Res = false; + std::string Name; + Type *Tys[1]; + Type *I8Ty = Type::getInt8Ty(M.getContext()); + Function *ReflectFunction; + + // Check for standard overloaded versions of llvm.nvvm.reflect + + for (unsigned i = 0; i != 5; ++i) { + Tys[0] = PointerType::get(I8Ty, i); + Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys); + ReflectFunction = M.getFunction(Name); + if(ReflectFunction != 0) { + Res |= handleFunction(ReflectFunction); + } + } + + ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); + // If reflect function is not used, then there will be + // no entry in the module. + if (ReflectFunction != 0) + Res |= handleFunction(ReflectFunction); + + return Res; +} diff --git a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h index 45cc0b8b67f2..02c5a94c3d03 100644 --- a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h +++ b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h @@ -1,5 +1,5 @@ -#ifndef __CL_COMMON_DEFINES_H__ -#define __CL_COMMON_DEFINES_H__ +#ifndef CL_COMMON_DEFINES_H +#define CL_COMMON_DEFINES_H // This file includes defines that are common to both kernel code and // the NVPTX back-end. @@ -119,4 +119,4 @@ typedef enum clk_sampler_type { #define CLK_LOCAL_MEM_FENCE (1 << 0) #define CLK_GLOBAL_MEM_FENCE (1 << 1) -#endif // __CL_COMMON_DEFINES_H__ +#endif // CL_COMMON_DEFINES_H |
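The NVPTXUtilities changes above key off entries in the module-level "nvvm.annotations" named metadata, using the property names added to NVPTXBaseInfo.h ("managed", "rdwrimage", and so on). A minimal sketch of how a client inside the NVPTX target directory might query such an annotation, mirroring what llvm::isManaged() does internally; the helper name hasManagedAnnotation is illustrative and not part of the patch:

    #include "NVPTXUtilities.h"
    #include "MCTargetDesc/NVPTXBaseInfo.h"
    #include "llvm/IR/GlobalVariable.h"
    using namespace llvm;

    // Returns true when the given global carries the "managed" annotation in
    // the module's nvvm.annotations metadata.
    static bool hasManagedAnnotation(const GlobalVariable &GV) {
      unsigned Val = 0;
      if (findOneNVVMAnnotation(&GV,
                                PropertyAnnotationNames[PROPERTY_MANAGED],
                                Val))
        return Val == 1;  // the backend asserts the value is 1 when present
      return false;
    }

The MutexGuard and clearAnnotationCache() added in NVPTXUtilities.cpp are what make queries like this safe to issue from concurrently running compilations, since the annotation cache is a shared, module-keyed ManagedStatic.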