Diffstat (limited to 'contrib/llvm/lib/Target/NVPTX')
26 files changed, 9058 insertions, 10090 deletions
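Usage note (an assumption about stock LLVM tooling, not part of the commit): the NVPTX.td hunk below wires up an sm_70 processor that pulls in the new PTX60 feature, so the target would be selected along the lines of: llc -march=nvptx64 -mcpu=sm_70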
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td index c77ddbc99789..aba37d363591 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td @@ -50,6 +50,8 @@ def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61", "Target SM 6.1">; def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62", "Target SM 6.2">; +def SM70 : SubtargetFeature<"sm_70", "SmVersion", "70", + "Target SM 7.0">; def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true", "Atomic operations with scope">; @@ -67,6 +69,8 @@ def PTX43 : SubtargetFeature<"ptx43", "PTXVersion", "43", "Use PTX version 4.3">; def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50", "Use PTX version 5.0">; +def PTX60 : SubtargetFeature<"ptx60", "PTXVersion", "60", + "Use PTX version 6.0">; //===----------------------------------------------------------------------===// // NVPTX supported processors. @@ -87,6 +91,7 @@ def : Proc<"sm_53", [SM53, PTX42]>; def : Proc<"sm_60", [SM60, PTX50, SATOM]>; def : Proc<"sm_61", [SM61, PTX50, SATOM]>; def : Proc<"sm_62", [SM62, PTX50, SATOM]>; +def : Proc<"sm_70", [SM70, PTX60, SATOM]>; def NVPTXInstrInfo : InstrInfo { } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 0139646fc3f7..2aa395642c40 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -45,6 +45,9 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -75,10 +78,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <cassert> #include <cstdint> @@ -400,7 +400,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { O << " ("; if (isABI) { - if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) { + if (Ty->isFloatingPointTy() || (Ty->isIntegerTy() && !Ty->isIntegerTy(128))) { unsigned size = 0; if (auto *ITy = dyn_cast<IntegerType>(Ty)) { size = ITy->getBitWidth(); @@ -418,7 +418,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } else if (isa<PointerType>(Ty)) { O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits() << " func_retval0"; - } else if (Ty->isAggregateType() || Ty->isVectorTy()) { + } else if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!getAlign(*F, 0, retAlignment)) @@ -457,8 +457,8 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF, raw_ostream &O) { - const Function *F = MF.getFunction(); - printReturnValStr(F, O); + const Function &F = MF.getFunction(); + printReturnValStr(&F, O); } // Return true if MBB is the header of a loop marked with @@ -502,13 +502,13 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { raw_svector_ostream O(Str); if (!GlobalsEmitted) { - 
emitGlobals(*MF->getFunction()->getParent()); + emitGlobals(*MF->getFunction().getParent()); GlobalsEmitted = true; } // Set up MRI = &MF->getRegInfo(); - F = MF->getFunction(); + F = &MF->getFunction(); emitLinkageDirective(F, O); if (isKernelFunction(*F)) O << ".entry "; @@ -536,7 +536,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() { SmallString<128> Str; raw_svector_ostream O(Str); - emitDemotedVars(MF->getFunction(), O); + emitDemotedVars(&MF->getFunction(), O); OutStreamer->EmitRawText(O.str()); } @@ -1425,6 +1425,14 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, else O << " .align " << GVar->getAlignment(); + // Special case for i128 + if (ETy->isIntegerTy(128)) { + O << " .b8 "; + getSymbol(GVar)->print(O, MAI); + O << "[16]"; + return; + } + if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) { O << " ."; O << getPTXFundamentalTypeStr(ETy); @@ -1551,7 +1559,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) { - if (Ty->isAggregateType() || Ty->isVectorTy()) { + if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type @@ -1700,8 +1708,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF, raw_ostream &O) { - const Function *F = MF.getFunction(); - emitFunctionParamList(F, O); + const Function &F = MF.getFunction(); + emitFunctionParamList(&F, O); } void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( @@ -2148,7 +2156,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/false, - !MF ? nullptr : MF->getFunction()->getParent()); + !MF ? nullptr : MF->getFunction().getParent()); report_fatal_error(OS.str()); } @@ -2162,7 +2170,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/ false, - !MF ? nullptr : MF->getFunction()->getParent()); + !MF ? nullptr : MF->getFunction().getParent()); report_fatal_error(OS.str()); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp index 7d4be8e809cf..f02c33f9249a 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -18,6 +18,7 @@ //===----------------------------------------------------------------------===// #include "NVPTX.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" @@ -61,6 +62,11 @@ bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) { } } + // Do the same for local functions. 
+ for (Function &F : M.functions()) + if (F.hasLocalLinkage()) + F.setName(cleanUpName(F.getName())); + return true; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 6ced2f6967cf..729f3ed7b79e 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -20,8 +20,8 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MachineLocation.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h index 320ca9a2f095..a802cf85d2e0 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H #define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class NVPTXSubtarget; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 274977254046..57e2acc0d7e0 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -496,11 +496,325 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { SelectCode(N); } +// Each instruction has four addressing variants. WMMA_VARIANTS() macro below +// constructs an array indexed by WmmaVariant which getWmmaLdVariant() uses to +// look up the intrinsic ID of particular variant. 
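// For reference (derived mechanically from the WMMA_VARIANTS() #define just
// below, not an extra line of the commit):
//   WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col)
// expands to
//   {{ NVPTX::INT_WMMA_LOAD_A_col_ari64, NVPTX::INT_WMMA_LOAD_A_col_ari64_stride,
//      NVPTX::INT_WMMA_LOAD_A_col_avar, NVPTX::INT_WMMA_LOAD_A_col_avar_stride }}
// i.e. one machine opcode per WmmaVariant addressing mode.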
+enum WmmaVariant { + WMMA_VARIANT_ARI64, + WMMA_VARIANT_ARI64_STRIDE, + WMMA_VARIANT_AVAR, + WMMA_VARIANT_AVAR_STRIDE, +}; + +// clang-format off +#define WMMA_VARIANTS(base) \ + {{ base##_ari64, base##_ari64_stride, base##_avar, base##_avar_stride }} +// clang-format on + +static unsigned getWmmaLdVariant(WmmaVariant Variant, bool Stride, + const std::array<unsigned, 4> Variants) { + if (Stride) { + if (Variant == WMMA_VARIANT_ARI64) + Variant = WMMA_VARIANT_ARI64_STRIDE; + else if (Variant == WMMA_VARIANT_AVAR) + Variant = WMMA_VARIANT_AVAR_STRIDE; + } + return Variants[Variant]; +} + +static Optional<unsigned> +getWmmaLdStOpcode(unsigned IntrinsicID, + WmmaVariant Variant = WMMA_VARIANT_ARI64) { + switch (IntrinsicID) { + default: + return None; + // + // WMMA_LOAD_A f16 + // + case Intrinsic::nvvm_wmma_load_a_f16_col: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col)); + case Intrinsic::nvvm_wmma_load_a_f16_row: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row)); + case Intrinsic::nvvm_wmma_load_a_f16_col_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col)); + case Intrinsic::nvvm_wmma_load_a_f16_row_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row)); + case Intrinsic::nvvm_wmma_load_a_f16_col_shared: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared)); + case Intrinsic::nvvm_wmma_load_a_f16_row_shared: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared)); + case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared)); + case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared)); + case Intrinsic::nvvm_wmma_load_a_f16_col_global: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global)); + case Intrinsic::nvvm_wmma_load_a_f16_row_global: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global)); + case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global)); + case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global)); + + // + // WMMA_LOAD_B f16 + // + case Intrinsic::nvvm_wmma_load_b_f16_col: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col)); + case Intrinsic::nvvm_wmma_load_b_f16_row: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row)); + case Intrinsic::nvvm_wmma_load_b_f16_col_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col)); + case Intrinsic::nvvm_wmma_load_b_f16_row_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row)); + case Intrinsic::nvvm_wmma_load_b_f16_col_shared: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared)); + case Intrinsic::nvvm_wmma_load_b_f16_row_shared: + return getWmmaLdVariant(Variant, /*Stride=*/false, + 
WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared)); + case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared)); + case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared)); + case Intrinsic::nvvm_wmma_load_b_f16_col_global: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global)); + case Intrinsic::nvvm_wmma_load_b_f16_row_global: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global)); + case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global)); + case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global)); + + // + // WMMA_LOAD_C f16 + // + case Intrinsic::nvvm_wmma_load_c_f16_col: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col)); + case Intrinsic::nvvm_wmma_load_c_f16_row: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row)); + case Intrinsic::nvvm_wmma_load_c_f16_col_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col)); + case Intrinsic::nvvm_wmma_load_c_f16_row_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row)); + case Intrinsic::nvvm_wmma_load_c_f16_col_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared)); + case Intrinsic::nvvm_wmma_load_c_f16_row_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared)); + case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared)); + case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared)); + case Intrinsic::nvvm_wmma_load_c_f16_col_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global)); + case Intrinsic::nvvm_wmma_load_c_f16_row_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global)); + case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global)); + case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global)); + + // + // WMMA_LOAD_C f32 + // + case Intrinsic::nvvm_wmma_load_c_f32_col: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col)); + case Intrinsic::nvvm_wmma_load_c_f32_row: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row)); + case Intrinsic::nvvm_wmma_load_c_f32_col_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col)); + case Intrinsic::nvvm_wmma_load_c_f32_row_stride: + return getWmmaLdVariant(Variant, 
/*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row)); + case Intrinsic::nvvm_wmma_load_c_f32_col_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared)); + case Intrinsic::nvvm_wmma_load_c_f32_row_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared)); + case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared)); + case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared)); + case Intrinsic::nvvm_wmma_load_c_f32_col_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global)); + case Intrinsic::nvvm_wmma_load_c_f32_row_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global)); + case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global)); + case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global)); + + // + // WMMA_STORE_D f16 + // + case Intrinsic::nvvm_wmma_store_d_f16_col: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col)); + case Intrinsic::nvvm_wmma_store_d_f16_row: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row)); + case Intrinsic::nvvm_wmma_store_d_f16_col_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col)); + case Intrinsic::nvvm_wmma_store_d_f16_row_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row)); + case Intrinsic::nvvm_wmma_store_d_f16_col_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared)); + case Intrinsic::nvvm_wmma_store_d_f16_row_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared)); + case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared)); + case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared)); + case Intrinsic::nvvm_wmma_store_d_f16_col_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global)); + case Intrinsic::nvvm_wmma_store_d_f16_row_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global)); + case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global)); + case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global)); + + // + // WMMA_STORE_D f32 + // + case Intrinsic::nvvm_wmma_store_d_f32_col: + return getWmmaLdVariant(Variant, /*Stride=*/false, + 
WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col)); + case Intrinsic::nvvm_wmma_store_d_f32_row: + return getWmmaLdVariant(Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row)); + case Intrinsic::nvvm_wmma_store_d_f32_col_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col)); + case Intrinsic::nvvm_wmma_store_d_f32_row_stride: + return getWmmaLdVariant(Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row)); + case Intrinsic::nvvm_wmma_store_d_f32_col_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared)); + case Intrinsic::nvvm_wmma_store_d_f32_row_shared: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared)); + case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared)); + case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared)); + case Intrinsic::nvvm_wmma_store_d_f32_col_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global)); + case Intrinsic::nvvm_wmma_store_d_f32_row_global: + return getWmmaLdVariant( + Variant, /*Stride=*/false, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global)); + case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global)); + case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride: + return getWmmaLdVariant( + Variant, /*Stride=*/true, + WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global)); + } +} +#undef WMMA_VARIANTS + bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (getWmmaLdStOpcode(IID)) + return tryWMMA_LDST(N); + switch (IID) { default: return false; + case Intrinsic::nvvm_match_all_sync_i32p: + case Intrinsic::nvvm_match_all_sync_i64p: + SelectMatchAll(N); + return true; case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_p: @@ -689,7 +1003,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, return true; // Load wasn't explicitly invariant. Attempt to infer invariance. 
- if (!isKernelFunction(*F->getFunction())) + if (!isKernelFunction(F->getFunction())) return false; // We use GetUnderlyingObjects() here instead of @@ -715,6 +1029,39 @@ bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { case Intrinsic::nvvm_texsurf_handle_internal: SelectTexSurfHandle(N); return true; + case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32: + case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32: + case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32: + case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32: + case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite: + return tryWMMA_MMA(N); } } @@ -726,6 +1073,36 @@ void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { MVT::i64, GlobalVal)); } +void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) { + SDLoc DL(N); + enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 }; + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + unsigned OpcodeIndex = + (IID == Intrinsic::nvvm_match_all_sync_i64p) ? 
IS_I64 : 0; + SDValue MaskOp = N->getOperand(2); + SDValue ValueOp = N->getOperand(3); + if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) { + OpcodeIndex |= HAS_CONST_VALUE; + ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL, + ValueConst->getValueType(0)); + } + if (ConstantSDNode *MaskConst = dyn_cast<ConstantSDNode>(MaskOp)) { + OpcodeIndex |= HAS_CONST_MASK; + MaskOp = CurDAG->getTargetConstant(MaskConst->getZExtValue(), DL, + MaskConst->getValueType(0)); + } + // Maps {IS_I64, HAS_CONST_VALUE, HAS_CONST_MASK} -> opcode + unsigned Opcodes[8] = { + NVPTX::MATCH_ALLP_SYNC_32rr, NVPTX::MATCH_ALLP_SYNC_32ri, + NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii, + NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri, + NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii}; + SDNode *NewNode = CurDAG->getMachineNode( + Opcodes[OpcodeIndex], DL, {ValueOp->getValueType(0), MVT::i1, MVT::Other}, + {MaskOp, ValueOp}); + ReplaceNode(N, NewNode); +} + void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { SDValue Src = N->getOperand(0); AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); @@ -2271,9 +2648,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { } bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { - SDValue Chain = N->getOperand(0); unsigned Opc = 0; - SmallVector<SDValue, 8> Ops; switch (N->getOpcode()) { default: return false; @@ -2784,1211 +3159,518 @@ bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { } // Copy over operands - for (unsigned i = 1; i < N->getNumOperands(); ++i) { - Ops.push_back(N->getOperand(i)); - } + SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end()); + Ops.push_back(N->getOperand(0)); // Move chain to the back. - Ops.push_back(Chain); ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); return true; } bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) { - SDValue Chain = N->getOperand(0); - SDValue TexHandle = N->getOperand(1); unsigned Opc = 0; - SmallVector<SDValue, 8> Ops; switch (N->getOpcode()) { default: return false; case NVPTXISD::Suld1DI8Clamp: Opc = NVPTX::SULD_1D_I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI16Clamp: Opc = NVPTX::SULD_1D_I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI32Clamp: Opc = NVPTX::SULD_1D_I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI64Clamp: Opc = NVPTX::SULD_1D_I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I8Clamp: Opc = NVPTX::SULD_1D_V2I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I16Clamp: Opc = NVPTX::SULD_1D_V2I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I32Clamp: Opc = NVPTX::SULD_1D_V2I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I64Clamp: Opc = NVPTX::SULD_1D_V2I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I8Clamp: Opc = NVPTX::SULD_1D_V4I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case 
NVPTXISD::Suld1DV4I16Clamp: Opc = NVPTX::SULD_1D_V4I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I32Clamp: Opc = NVPTX::SULD_1D_V4I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI8Clamp: Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI16Clamp: Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI32Clamp: Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI64Clamp: Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I64Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI8Clamp: Opc = NVPTX::SULD_2D_I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI16Clamp: Opc = NVPTX::SULD_2D_I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI32Clamp: Opc = NVPTX::SULD_2D_I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI64Clamp: Opc = NVPTX::SULD_2D_I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I8Clamp: Opc = NVPTX::SULD_2D_V2I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case 
NVPTXISD::Suld2DV2I16Clamp: Opc = NVPTX::SULD_2D_V2I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I32Clamp: Opc = NVPTX::SULD_2D_V2I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I64Clamp: Opc = NVPTX::SULD_2D_V2I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV4I8Clamp: Opc = NVPTX::SULD_2D_V4I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV4I16Clamp: Opc = NVPTX::SULD_2D_V4I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV4I32Clamp: Opc = NVPTX::SULD_2D_V4I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI8Clamp: Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI16Clamp: Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI32Clamp: Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI64Clamp: Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I64Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV4I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV4I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - 
Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV4I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI8Clamp: Opc = NVPTX::SULD_3D_I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI16Clamp: Opc = NVPTX::SULD_3D_I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI32Clamp: Opc = NVPTX::SULD_3D_I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI64Clamp: Opc = NVPTX::SULD_3D_I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I8Clamp: Opc = NVPTX::SULD_3D_V2I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I16Clamp: Opc = NVPTX::SULD_3D_V2I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I32Clamp: Opc = NVPTX::SULD_3D_V2I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I64Clamp: Opc = NVPTX::SULD_3D_V2I64_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV4I8Clamp: Opc = NVPTX::SULD_3D_V4I8_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV4I16Clamp: Opc = NVPTX::SULD_3D_V4I16_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV4I32Clamp: Opc = NVPTX::SULD_3D_V4I32_CLAMP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI8Trap: Opc = NVPTX::SULD_1D_I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI16Trap: Opc = NVPTX::SULD_1D_I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI32Trap: Opc = NVPTX::SULD_1D_I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI64Trap: Opc = NVPTX::SULD_1D_I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I8Trap: Opc = NVPTX::SULD_1D_V2I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - 
Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I16Trap: Opc = NVPTX::SULD_1D_V2I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I32Trap: Opc = NVPTX::SULD_1D_V2I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I64Trap: Opc = NVPTX::SULD_1D_V2I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I8Trap: Opc = NVPTX::SULD_1D_V4I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I16Trap: Opc = NVPTX::SULD_1D_V4I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I32Trap: Opc = NVPTX::SULD_1D_V4I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI8Trap: Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI16Trap: Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI32Trap: Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI64Trap: Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I64Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI8Trap: Opc = NVPTX::SULD_2D_I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI16Trap: Opc = NVPTX::SULD_2D_I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - 
Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI32Trap: Opc = NVPTX::SULD_2D_I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DI64Trap: Opc = NVPTX::SULD_2D_I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I8Trap: Opc = NVPTX::SULD_2D_V2I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I16Trap: Opc = NVPTX::SULD_2D_V2I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I32Trap: Opc = NVPTX::SULD_2D_V2I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV2I64Trap: Opc = NVPTX::SULD_2D_V2I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV4I8Trap: Opc = NVPTX::SULD_2D_V4I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV4I16Trap: Opc = NVPTX::SULD_2D_V4I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DV4I32Trap: Opc = NVPTX::SULD_2D_V4I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI8Trap: Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI16Trap: Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI32Trap: Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayI64Trap: Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV2I64Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP; - 
Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV4I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV4I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld2DArrayV4I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI8Trap: Opc = NVPTX::SULD_3D_I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI16Trap: Opc = NVPTX::SULD_3D_I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI32Trap: Opc = NVPTX::SULD_3D_I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DI64Trap: Opc = NVPTX::SULD_3D_I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I8Trap: Opc = NVPTX::SULD_3D_V2I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I16Trap: Opc = NVPTX::SULD_3D_V2I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I32Trap: Opc = NVPTX::SULD_3D_V2I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV2I64Trap: Opc = NVPTX::SULD_3D_V2I64_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV4I8Trap: Opc = NVPTX::SULD_3D_V4I8_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV4I16Trap: Opc = NVPTX::SULD_3D_V4I16_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld3DV4I32Trap: Opc = NVPTX::SULD_3D_V4I32_TRAP; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI8Zero: Opc = NVPTX::SULD_1D_I8_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case 
NVPTXISD::Suld1DI16Zero: Opc = NVPTX::SULD_1D_I16_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI32Zero: Opc = NVPTX::SULD_1D_I32_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DI64Zero: Opc = NVPTX::SULD_1D_I64_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I8Zero: Opc = NVPTX::SULD_1D_V2I8_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I16Zero: Opc = NVPTX::SULD_1D_V2I16_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I32Zero: Opc = NVPTX::SULD_1D_V2I32_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV2I64Zero: Opc = NVPTX::SULD_1D_V2I64_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I8Zero: Opc = NVPTX::SULD_1D_V4I8_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I16Zero: Opc = NVPTX::SULD_1D_V4I16_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DV4I32Zero: Opc = NVPTX::SULD_1D_V4I32_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI8Zero: Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI16Zero: Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI32Zero: Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayI64Zero: Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV2I64Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(Chain); break; case NVPTXISD::Suld1DArrayV4I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO; - Ops.push_back(TexHandle); - Ops.push_back(N->getOperand(2)); - 
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld1DArrayV4I32Zero:
     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DI8Zero:
     Opc = NVPTX::SULD_2D_I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DI16Zero:
     Opc = NVPTX::SULD_2D_I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DI32Zero:
     Opc = NVPTX::SULD_2D_I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DI64Zero:
     Opc = NVPTX::SULD_2D_I64_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV2I8Zero:
     Opc = NVPTX::SULD_2D_V2I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV2I16Zero:
     Opc = NVPTX::SULD_2D_V2I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV2I32Zero:
     Opc = NVPTX::SULD_2D_V2I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV2I64Zero:
     Opc = NVPTX::SULD_2D_V2I64_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV4I8Zero:
     Opc = NVPTX::SULD_2D_V4I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV4I16Zero:
     Opc = NVPTX::SULD_2D_V4I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DV4I32Zero:
     Opc = NVPTX::SULD_2D_V4I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayI8Zero:
     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayI16Zero:
     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayI32Zero:
     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayI64Zero:
     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV2I8Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV2I16Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV2I32Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV2I64Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV4I8Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV4I16Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld2DArrayV4I32Zero:
     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DI8Zero:
     Opc = NVPTX::SULD_3D_I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DI16Zero:
     Opc = NVPTX::SULD_3D_I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DI32Zero:
     Opc = NVPTX::SULD_3D_I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DI64Zero:
     Opc = NVPTX::SULD_3D_I64_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV2I8Zero:
     Opc = NVPTX::SULD_3D_V2I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV2I16Zero:
     Opc = NVPTX::SULD_3D_V2I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV2I32Zero:
     Opc = NVPTX::SULD_3D_V2I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV2I64Zero:
     Opc = NVPTX::SULD_3D_V2I64_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV4I8Zero:
     Opc = NVPTX::SULD_3D_V4I8_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV4I16Zero:
     Opc = NVPTX::SULD_3D_V4I16_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   case NVPTXISD::Suld3DV4I32Zero:
     Opc = NVPTX::SULD_3D_V4I32_ZERO;
-    Ops.push_back(TexHandle);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(N->getOperand(3));
-    Ops.push_back(N->getOperand(4));
-    Ops.push_back(Chain);
     break;
   }
+
+  // Copy over operands
+  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
+  Ops.push_back(N->getOperand(0)); // Move chain to the back.
+
   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
   return true;
 }
@@ -4386,3 +4068,172 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
     }
   }
 }
+
+bool NVPTXDAGToDAGISel::tryWMMA_LDST(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  SDValue Op1 = N->getOperand(2);
+  SDValue Addr, Offset, Base;
+  Optional<unsigned> Opcode;
+  SDLoc DL(N);
+  MemSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+  WmmaVariant Variant;
+  SmallVector<SDValue, 12> Ops;
+  bool isStore = N->getNumValues() == 1; // Store ops only return a chain.
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    Variant = WMMA_VARIANT_AVAR;
+    Ops.push_back(Addr);
+  } else if (SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) ||
+             SelectADDRri64(Op1.getNode(), Op1, Base, Offset)) {
+    Variant = WMMA_VARIANT_ARI64;
+    Ops.push_back(Base);
+    Ops.push_back(Offset);
+  } else {
+    Variant = WMMA_VARIANT_AVAR;
+    Ops.push_back(Op1);
+  }
+  unsigned NumOps = N->getNumOperands();
+  // Pass through the rest of the operands to the machine node.
+  for (unsigned i = 3; i < NumOps; ++i)
+    Ops.push_back(N->getOperand(i));
+  Ops.push_back(Chain);
+
+  Opcode = getWmmaLdStOpcode(IID, Variant);
+  if (!Opcode) {
+    llvm::errs() << "tryWMMALD - no Opcode.\n";
+    return false;
+  }
+
+  EVT MemVT = MemSD->getMemoryVT();
+  assert(MemVT.isVector() && "Expected vector return type.");
+
+  SDNode *MN;
+  if (isStore) {
+    MN = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
+  } else {
+    SmallVector<EVT, 9> InstVTs(MemVT.getVectorNumElements(),
+                                MemSD->getValueType(0));
+    InstVTs.push_back(MVT::Other);
+    MN = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTs, Ops);
+  }
+
+  ReplaceNode(N, MN);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryWMMA_MMA(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  SDLoc DL(N);
+  unsigned Opc;
+
+  switch (IID) {
+  default:
+    return false;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16_satfinite;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32;
+    break;
+  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
+    Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32_satfinite;
+    break;
+  }
+
+  SmallVector<SDValue, 24> Ops;
+  // Pass through operands and return value types to the machine node.
+  for (unsigned i = 1; i < N->getNumOperands(); ++i)
+    Ops.push_back(N->getOperand(i));
+  SmallVector<EVT, 8> InstVTs(N->getNumValues(), N->getValueType(0));
+  SDNode *MN = CurDAG->getMachineNode(Opc, DL, InstVTs, Ops);
+  ReplaceNode(N, MN);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 8fc38e7c4612..b23c27581a17 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -58,6 +58,7 @@ private:
   bool tryIntrinsicNoChain(SDNode *N);
   bool tryIntrinsicChain(SDNode *N);
   void SelectTexSurfHandle(SDNode *N);
+  void SelectMatchAll(SDNode *N);
   bool tryLoad(SDNode *N);
   bool tryLoadVector(SDNode *N);
   bool tryLDGLDU(SDNode *N);
@@ -73,6 +74,8 @@ private:
   bool tryConstantFP16(SDNode *N);
   bool SelectSETP_F16X2(SDNode *N);
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
+  bool tryWMMA_LDST(SDNode *N);
+  bool tryWMMA_MMA(SDNode *N);
 
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f800d91f4093..f1e4251a44b5 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1,4667 +1,4788 @@
-//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that NVPTX uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "NVPTXISelLowering.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
-#include "NVPTX.h"
-#include "NVPTXSection.h"
-#include "NVPTXSubtarget.h"
-#include "NVPTXTargetMachine.h"
-#include "NVPTXTargetObjectFile.h"
-#include "NVPTXUtilities.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetCallingConv.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "nvptx-lower"
-
-using namespace llvm;
-
-static unsigned int uniqueCallSite = 0;
-
-static cl::opt<bool> sched4reg(
- "nvptx-sched4reg",
- cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
-
-static cl::opt<unsigned>
-FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
- " 1: do it 2: do it aggressively"),
- cl::init(2));
-
-static cl::opt<int> UsePrecDivF32(
- "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
- " IEEE Compliant F32 div.rnd if available."),
- cl::init(2));
-
-static cl::opt<bool> UsePrecSqrtF32(
- "nvptx-prec-sqrtf32", cl::Hidden,
- cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
- cl::init(true));
-
-static cl::opt<bool> FtzEnabled(
- "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
- cl::init(false));
-
-int NVPTXTargetLowering::getDivF32Level() const {
- if (UsePrecDivF32.getNumOccurrences() > 0) {
- // If nvptx-prec-divf32=N is used on the command-line, always honor it
- return UsePrecDivF32;
- } else {
- // Otherwise, use div.approx if fast math is enabled
- if (getTargetMachine().Options.UnsafeFPMath)
- return 0;
- else
- return 2;
- }
-}
-
-bool NVPTXTargetLowering::usePrecSqrtF32() const {
- if (UsePrecSqrtF32.getNumOccurrences() > 0) {
- // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
- return UsePrecSqrtF32;
- } else {
- // Otherwise, use sqrt.approx if fast math is enabled
- return !getTargetMachine().Options.UnsafeFPMath;
- }
-}
-
-bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
- // TODO: Get rid of this flag; there can be only one way to do this.
- if (FtzEnabled.getNumOccurrences() > 0) {
- // If nvptx-f32ftz is used on the command-line, always honor it
- return FtzEnabled;
- } else {
- const Function *F = MF.getFunction();
- // Otherwise, check for an nvptx-f32ftz attribute on the function
- if (F->hasFnAttribute("nvptx-f32ftz"))
- return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
- else
- return false;
- }
-}
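
The three accessors above share one idiom: a cl::opt that was passed explicitly
on the command line always wins, and cl::opt::getNumOccurrences() is what
distinguishes an explicit value from the compiled-in default. A minimal sketch
of that idiom, assuming a hypothetical -my-feature flag rather than any real
NVPTX option:

#include "llvm/Support/CommandLine.h"

// Hypothetical flag, for illustration only; not a real NVPTX option.
static llvm::cl::opt<bool> MyFeature(
    "my-feature", llvm::cl::Hidden,
    llvm::cl::desc("Example flag demonstrating the override idiom."),
    llvm::cl::init(false));

static bool useMyFeature(bool HeuristicDefault) {
  // getNumOccurrences() > 0 means the flag appeared on the command line,
  // so it overrides the heuristic; otherwise use the derived default.
  if (MyFeature.getNumOccurrences() > 0)
    return MyFeature;
  return HeuristicDefault;
}
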
-
-static bool IsPTXVectorType(MVT VT) {
- switch (VT.SimpleTy) {
- default:
- return false;
- case MVT::v2i1:
- case MVT::v4i1:
- case MVT::v2i8:
- case MVT::v4i8:
- case MVT::v2i16:
- case MVT::v4i16:
- case MVT::v2i32:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v4f16:
- case MVT::v8f16: // <4 x f16x2>
- case MVT::v2f32:
- case MVT::v4f32:
- case MVT::v2f64:
- return true;
- }
-}
-
-/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
-/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
-/// into their primitive components.
-/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
-/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
-/// LowerCall, and LowerReturn.
-static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
- Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<uint64_t> *Offsets = nullptr,
- uint64_t StartingOffset = 0) {
- SmallVector<EVT, 16> TempVTs;
- SmallVector<uint64_t, 16> TempOffsets;
-
- ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
- for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
- EVT VT = TempVTs[i];
- uint64_t Off = TempOffsets[i];
- // Split vectors into individual elements, except for v2f16, which
- // we will pass as a single scalar.
- if (VT.isVector()) {
- unsigned NumElts = VT.getVectorNumElements();
- EVT EltVT = VT.getVectorElementType();
- // Vectors with an even number of f16 elements will be passed to
- // us as an array of v2f16 elements. We must match this so we
- // stay in sync with Ins/Outs.
- if (EltVT == MVT::f16 && NumElts % 2 == 0) {
- EltVT = MVT::v2f16;
- NumElts /= 2;
- }
- for (unsigned j = 0; j != NumElts; ++j) {
- ValueVTs.push_back(EltVT);
- if (Offsets)
- Offsets->push_back(Off + j * EltVT.getStoreSize());
- }
- } else {
- ValueVTs.push_back(VT);
- if (Offsets)
- Offsets->push_back(Off);
- }
- }
-}
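
Worked example for the flattening above: an <8 x f16> piece at offset 0 comes
back as four v2f16 pieces at offsets 0, 4, 8, and 12, while a <3 x f16> stays
as three scalar f16 pieces. A self-contained sketch of just the
vector-splitting rule, modeling types as strings rather than LLVM's EVTs:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct Piece { std::string VT; uint64_t Offset; };

// Split a vector into pieces, pairing f16 elements into v2f16 when the
// element count is even, mirroring the rule in ComputePTXValueVTs.
static std::vector<Piece> flattenVector(std::string EltVT, unsigned NumElts,
                                        unsigned EltStoreSize,
                                        uint64_t StartOffset) {
  if (EltVT == "f16" && NumElts % 2 == 0) {
    EltVT = "v2f16";
    NumElts /= 2;
    EltStoreSize *= 2;
  }
  std::vector<Piece> Out;
  for (unsigned J = 0; J != NumElts; ++J)
    Out.push_back({EltVT, StartOffset + J * EltStoreSize});
  return Out;
}

int main() {
  for (const Piece &P : flattenVector("f16", 8, 2, 0)) // four v2f16 pieces
    std::cout << P.VT << " @ offset " << P.Offset << "\n";
}
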
-
-// Check whether we can merge loads/stores of some of the pieces of a
-// flattened function parameter or return value into a single vector
-// load/store.
-//
-// The flattened parameter is represented as a list of EVTs and
-// offsets, and the whole structure is aligned to ParamAlignment. This
-// function determines whether we can load/store pieces of the
-// parameter starting at index Idx using a single vectorized op of
-// size AccessSize. If so, it returns the number of param pieces
-// covered by the vector op. Otherwise, it returns 1.
-static unsigned CanMergeParamLoadStoresStartingAt(
- unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
- assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
-
- // Can't vectorize if param alignment is not sufficient.
- if (AccessSize > ParamAlignment)
- return 1;
- // Can't vectorize if offset is not aligned.
- if (Offsets[Idx] & (AccessSize - 1))
- return 1;
-
- EVT EltVT = ValueVTs[Idx];
- unsigned EltSize = EltVT.getStoreSize();
-
- // Element is too large to vectorize.
- if (EltSize >= AccessSize)
- return 1;
-
- unsigned NumElts = AccessSize / EltSize;
- // Can't vectorize if AccessSize is not a multiple of EltSize.
- if (AccessSize != EltSize * NumElts)
- return 1;
-
- // We don't have enough elements to vectorize.
- if (Idx + NumElts > ValueVTs.size())
- return 1;
-
- // PTX ISA can only deal with 2- and 4-element vector ops.
- if (NumElts != 4 && NumElts != 2)
- return 1;
-
- for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
- // Types do not match.
- if (ValueVTs[j] != EltVT)
- return 1;
-
- // Elements are not contiguous.
- if (Offsets[j] - Offsets[j - 1] != EltSize)
- return 1;
- }
- // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
- return NumElts;
-}
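
The checks above form a small decision procedure. A standalone restatement
over plain element sizes and byte offsets (illustrative, not the LLVM API);
for four contiguous 4-byte pieces in a 16-byte-aligned parameter it returns 4:

#include <cstdint>
#include <iostream>
#include <vector>

static unsigned canMergeStartingAt(unsigned Idx, uint32_t AccessSize,
                                   const std::vector<unsigned> &EltSizes,
                                   const std::vector<uint64_t> &Offsets,
                                   unsigned ParamAlignment) {
  if (AccessSize > ParamAlignment) return 1;     // parameter under-aligned
  if (Offsets[Idx] & (AccessSize - 1)) return 1; // piece itself misaligned
  unsigned EltSize = EltSizes[Idx];
  if (EltSize >= AccessSize) return 1;           // element too large
  unsigned NumElts = AccessSize / EltSize;
  if (AccessSize != EltSize * NumElts) return 1; // size not a clean multiple
  if (Idx + NumElts > EltSizes.size()) return 1; // not enough elements left
  if (NumElts != 2 && NumElts != 4) return 1;    // PTX has only v2/v4 ops
  for (unsigned J = Idx + 1; J < Idx + NumElts; ++J)
    if (EltSizes[J] != EltSize || Offsets[J] - Offsets[J - 1] != EltSize)
      return 1;                                  // mixed types or gaps
  return NumElts;
}

int main() {
  std::vector<unsigned> Sizes{4, 4, 4, 4};       // four i32/f32-sized pieces
  std::vector<uint64_t> Offs{0, 4, 8, 12};
  std::cout << canMergeStartingAt(0, 16, Sizes, Offs, 16) << "\n"; // prints 4
}
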
-
-// Flags for tracking per-element vectorization state of loads/stores
-// of a flattened function parameter or return value.
-enum ParamVectorizationFlags {
- PVF_INNER = 0x0, // Middle elements of a vector.
- PVF_FIRST = 0x1, // First element of the vector.
- PVF_LAST = 0x2, // Last element of the vector.
- // Scalar is effectively a 1-element vector.
- PVF_SCALAR = PVF_FIRST | PVF_LAST
-};
-
-// Computes whether and how we can vectorize the loads/stores of a
-// flattened function parameter or return value.
-//
-// The flattened parameter is represented as the list of ValueVTs and
-// Offsets, and is aligned to ParamAlignment bytes. We return a vector
-// of the same size as ValueVTs indicating how each piece should be
-// loaded/stored (i.e. as a scalar, or as part of a vector
-// load/store).
-static SmallVector<ParamVectorizationFlags, 16>
-VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets,
- unsigned ParamAlignment) {
- // Set vector size to match ValueVTs and mark all elements as
- // scalars by default.
- SmallVector<ParamVectorizationFlags, 16> VectorInfo;
- VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
-
- // Check what we can vectorize using 128/64/32/16-bit accesses.
- for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
- // Skip elements we've already processed.
- assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
- for (unsigned AccessSize : {16, 8, 4, 2}) {
- unsigned NumElts = CanMergeParamLoadStoresStartingAt(
- I, AccessSize, ValueVTs, Offsets, ParamAlignment);
- // Mark vectorized elements.
- switch (NumElts) {
- default:
- llvm_unreachable("Unexpected return value");
- case 1:
- // Can't vectorize using this size, try next smaller size.
- continue;
- case 2:
- assert(I + 1 < E && "Not enough elements.");
- VectorInfo[I] = PVF_FIRST;
- VectorInfo[I + 1] = PVF_LAST;
- I += 1;
- break;
- case 4:
- assert(I + 3 < E && "Not enough elements.");
- VectorInfo[I] = PVF_FIRST;
- VectorInfo[I + 1] = PVF_INNER;
- VectorInfo[I + 2] = PVF_INNER;
- VectorInfo[I + 3] = PVF_LAST;
- I += 3;
- break;
- }
- // Break out of the inner loop because we've already succeeded
- // using the largest possible AccessSize.
- break;
- }
- }
- return VectorInfo;
-}
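
The driver above can be restated the same way: it greedily tries 16-, 8-, 4-,
and 2-byte accesses at each index, and a result of 1 means "try the next size
down". canMergeStartingAt below refers to the sketch after the previous
function; for four contiguous 4-byte pieces aligned to 16 this yields
PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST:

#include <cstdint>
#include <vector>

enum ParamVectorizationFlags {
  PVF_INNER = 0x0, PVF_FIRST = 0x1, PVF_LAST = 0x2,
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

unsigned canMergeStartingAt(unsigned Idx, uint32_t AccessSize,
                            const std::vector<unsigned> &EltSizes,
                            const std::vector<uint64_t> &Offsets,
                            unsigned ParamAlignment); // from previous sketch

std::vector<ParamVectorizationFlags>
vectorizePieces(const std::vector<unsigned> &EltSizes,
                const std::vector<uint64_t> &Offsets,
                unsigned ParamAlignment) {
  std::vector<ParamVectorizationFlags> Info(EltSizes.size(), PVF_SCALAR);
  for (unsigned I = 0, E = EltSizes.size(); I != E; ++I) {
    for (unsigned AccessSize : {16u, 8u, 4u, 2u}) {
      unsigned NumElts =
          canMergeStartingAt(I, AccessSize, EltSizes, Offsets, ParamAlignment);
      if (NumElts == 1)
        continue;                     // this size failed; try a smaller one
      Info[I] = PVF_FIRST;
      for (unsigned J = I + 1; J + 1 < I + NumElts; ++J)
        Info[J] = PVF_INNER;
      Info[I + NumElts - 1] = PVF_LAST;
      I += NumElts - 1;               // skip the elements just covered
      break;                          // largest workable access size wins
    }
  }
  return Info;
}
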
-
-// NVPTXTargetLowering Constructor.
-NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
- const NVPTXSubtarget &STI)
- : TargetLowering(TM), nvTM(&TM), STI(STI) {
- // Always lower memset, memcpy, and memmove intrinsics to load/store
- // instructions, rather than generating calls to memset, memcpy, or memmove.
- MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
- MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
- MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
-
- setBooleanContents(ZeroOrNegativeOneBooleanContent);
- setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
-
- // Jump is Expensive. Don't create extra control flow for 'and', 'or'
- // condition branches.
- setJumpIsExpensive(true);
-
- // Wide divides are _very_ slow. Try to reduce the width of the divide if
- // possible.
- addBypassSlowDiv(64, 32);
-
- // By default, use the Source scheduling
- if (sched4reg)
- setSchedulingPreference(Sched::RegPressure);
- else
- setSchedulingPreference(Sched::Source);
-
- auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
- LegalizeAction NoF16Action) {
- setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
- };
-
- addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
- addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
- addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
- addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
- addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
- addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
- addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
- addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
-
- // Conversion to/from FP16/FP16x2 is always legal.
- setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
-
- setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
- setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
-
- // Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i8, Expand);
- setOperationAction(ISD::BR_CC, MVT::i16, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
- // Some SIGN_EXTEND_INREG can be done using cvt instruction.
- // For others we will expand to a SHL/SRA pair.
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
-
- setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
-
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
-
- if (STI.hasROT64()) {
- setOperationAction(ISD::ROTL, MVT::i64, Legal);
- setOperationAction(ISD::ROTR, MVT::i64, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
- }
- if (STI.hasROT32()) {
- setOperationAction(ISD::ROTL, MVT::i32, Legal);
- setOperationAction(ISD::ROTR, MVT::i32, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
- setOperationAction(ISD::ROTR, MVT::i32, Expand);
- }
-
- setOperationAction(ISD::ROTL, MVT::i16, Expand);
- setOperationAction(ISD::ROTR, MVT::i16, Expand);
- setOperationAction(ISD::ROTL, MVT::i8, Expand);
- setOperationAction(ISD::ROTR, MVT::i8, Expand);
- setOperationAction(ISD::BSWAP, MVT::i16, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- setOperationAction(ISD::BSWAP, MVT::i64, Expand);
-
- // Indirect branch is not supported.
- // This also disables Jump Table creation.
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
-
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-
- // We want to legalize constant-related memmove and memcpy intrinsics.
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
-
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setTruncStoreAction(VT, MVT::i1, Expand);
- }
-
- // This is legal in NVPTX
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
-
- // TRAP can be lowered to PTX trap
- setOperationAction(ISD::TRAP, MVT::Other, Legal);
-
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
-
- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::vector_valuetypes()) {
- if (IsPTXVectorType(VT)) {
- setOperationAction(ISD::LOAD, VT, Custom);
- setOperationAction(ISD::STORE, VT, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
- }
- }
-
- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
- for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
- setOperationAction(ISD::ABS, Ty, Legal);
- setOperationAction(ISD::SMIN, Ty, Legal);
- setOperationAction(ISD::SMAX, Ty, Legal);
- setOperationAction(ISD::UMIN, Ty, Legal);
- setOperationAction(ISD::UMAX, Ty, Legal);
-
- setOperationAction(ISD::CTPOP, Ty, Legal);
- setOperationAction(ISD::CTLZ, Ty, Legal);
- }
-
- setOperationAction(ISD::CTTZ, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ, MVT::i64, Expand);
-
- // PTX does not directly support SELP of i1, so promote to i32 first
- setOperationAction(ISD::SELECT, MVT::i1, Custom);
-
- // PTX cannot multiply two i64s in a single instruction.
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
- // We have some custom DAG combine patterns for these nodes
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SREM);
- setTargetDAGCombine(ISD::UREM);
-
- // setcc for f16x2 needs special handling to prevent the legalizer's
- // attempt to scalarize it due to v2i1 not being legal.
- if (STI.allowFP16Math())
- setTargetDAGCombine(ISD::SETCC);
-
- // Promote fp16 arithmetic if fp16 hardware isn't available or the
- // user passed --nvptx-no-fp16-math. The flag is useful because,
- // although sm_53+ GPUs have some sort of FP16 support in
- // hardware, only sm_53 and sm_60 have a full implementation. Others
- // only have a token amount of hardware and are likely to run faster
- // by using fp32 units instead.
- for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
- setFP16OperationAction(Op, MVT::f16, Legal, Promote);
- setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
- }
-
- // There's no neg.f16 instruction. Expand to (0-x).
- setOperationAction(ISD::FNEG, MVT::f16, Expand);
- setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
-
- // (would be) Library functions.
-
- // These map to conversion instructions for scalar FP types.
- for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
- ISD::FROUND, ISD::FTRUNC}) {
- setOperationAction(Op, MVT::f16, Legal);
- setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
- setOperationAction(Op, MVT::v2f16, Expand);
- }
-
- // 'Expand' implements FCOPYSIGN without calling an external library.
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-
- // These map to corresponding instructions for f32/f64. f16 must be
- // promoted to f32. v2f16 is expanded to f16, which is then promoted
- // to f32.
- for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
- ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
- setOperationAction(Op, MVT::f16, Promote);
- setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
- setOperationAction(Op, MVT::v2f16, Expand);
- }
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
-
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
- // No FPOW or FREM in PTX.
-
- // Now deduce the information based on the above-mentioned actions.
- computeRegisterProperties(STI.getRegisterInfo());
-}
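
One pattern in the constructor worth calling out is the setFP16OperationAction
helper, which picks between two legalize actions depending on whether fp16
math is allowed. A runnable toy version of that selection (the action names
are stand-ins, not LLVM's LegalizeAction enum):

#include <iostream>
#include <string>

enum LegalizeAction { Legal, Promote, Expand };

int main() {
  bool AllowFP16Math = false; // true on sm_53+ unless fp16 math is disabled
  auto setFP16OperationAction = [&](const std::string &Op, LegalizeAction A,
                                    LegalizeAction NoF16Action) {
    LegalizeAction Chosen = AllowFP16Math ? A : NoF16Action;
    std::cout << Op << " -> "
              << (Chosen == Legal ? "Legal"
                  : Chosen == Promote ? "Promote" : "Expand")
              << "\n";
  };
  // Mirrors setFP16OperationAction(ISD::FADD, MVT::f16, Legal, Promote):
  setFP16OperationAction("FADD.f16", Legal, Promote);   // prints Promote
  // Mirrors setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand):
  setFP16OperationAction("SETCC.v2f16", Legal, Expand); // prints Expand
}
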
-
-const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch ((NVPTXISD::NodeType)Opcode) {
- case NVPTXISD::FIRST_NUMBER:
- break;
- case NVPTXISD::CALL:
- return "NVPTXISD::CALL";
- case NVPTXISD::RET_FLAG:
- return "NVPTXISD::RET_FLAG";
- case NVPTXISD::LOAD_PARAM:
- return "NVPTXISD::LOAD_PARAM";
- case NVPTXISD::Wrapper:
- return "NVPTXISD::Wrapper";
- case NVPTXISD::DeclareParam:
- return "NVPTXISD::DeclareParam";
- case NVPTXISD::DeclareScalarParam:
- return "NVPTXISD::DeclareScalarParam";
- case NVPTXISD::DeclareRet:
- return "NVPTXISD::DeclareRet";
- case NVPTXISD::DeclareScalarRet:
- return "NVPTXISD::DeclareScalarRet";
- case NVPTXISD::DeclareRetParam:
- return "NVPTXISD::DeclareRetParam";
- case NVPTXISD::PrintCall:
- return "NVPTXISD::PrintCall";
- case NVPTXISD::PrintConvergentCall:
- return "NVPTXISD::PrintConvergentCall";
- case NVPTXISD::PrintCallUni:
- return "NVPTXISD::PrintCallUni";
- case NVPTXISD::PrintConvergentCallUni:
- return "NVPTXISD::PrintConvergentCallUni";
- case NVPTXISD::LoadParam:
- return "NVPTXISD::LoadParam";
- case NVPTXISD::LoadParamV2:
- return "NVPTXISD::LoadParamV2";
- case NVPTXISD::LoadParamV4:
- return "NVPTXISD::LoadParamV4";
- case NVPTXISD::StoreParam:
- return "NVPTXISD::StoreParam";
- case NVPTXISD::StoreParamV2:
- return "NVPTXISD::StoreParamV2";
- case NVPTXISD::StoreParamV4:
- return "NVPTXISD::StoreParamV4";
- case NVPTXISD::StoreParamS32:
- return "NVPTXISD::StoreParamS32";
- case NVPTXISD::StoreParamU32:
- return "NVPTXISD::StoreParamU32";
- case NVPTXISD::CallArgBegin:
- return "NVPTXISD::CallArgBegin";
- case NVPTXISD::CallArg:
- return "NVPTXISD::CallArg";
- case NVPTXISD::LastCallArg:
- return "NVPTXISD::LastCallArg";
- case NVPTXISD::CallArgEnd:
- return "NVPTXISD::CallArgEnd";
- case NVPTXISD::CallVoid:
- return "NVPTXISD::CallVoid";
- case NVPTXISD::CallVal:
- return "NVPTXISD::CallVal";
- case NVPTXISD::CallSymbol:
- return "NVPTXISD::CallSymbol";
- case NVPTXISD::Prototype:
- return "NVPTXISD::Prototype";
- case NVPTXISD::MoveParam:
- return "NVPTXISD::MoveParam";
- case NVPTXISD::StoreRetval:
- return "NVPTXISD::StoreRetval";
- case NVPTXISD::StoreRetvalV2:
- return "NVPTXISD::StoreRetvalV2";
- case NVPTXISD::StoreRetvalV4:
- return "NVPTXISD::StoreRetvalV4";
- case NVPTXISD::PseudoUseParam:
- return "NVPTXISD::PseudoUseParam";
- case NVPTXISD::RETURN:
- return "NVPTXISD::RETURN";
- case NVPTXISD::CallSeqBegin:
- return "NVPTXISD::CallSeqBegin";
- case NVPTXISD::CallSeqEnd:
- return "NVPTXISD::CallSeqEnd";
- case NVPTXISD::CallPrototype:
- return "NVPTXISD::CallPrototype";
- case NVPTXISD::LoadV2:
- return "NVPTXISD::LoadV2";
- case NVPTXISD::LoadV4:
- return "NVPTXISD::LoadV4";
- case NVPTXISD::LDGV2:
- return "NVPTXISD::LDGV2";
- case NVPTXISD::LDGV4:
- return "NVPTXISD::LDGV4";
- case NVPTXISD::LDUV2:
- return "NVPTXISD::LDUV2";
- case NVPTXISD::LDUV4:
- return "NVPTXISD::LDUV4";
- case NVPTXISD::StoreV2:
- return "NVPTXISD::StoreV2";
- case NVPTXISD::StoreV4:
- return "NVPTXISD::StoreV4";
- case NVPTXISD::FUN_SHFL_CLAMP:
- return "NVPTXISD::FUN_SHFL_CLAMP";
- case NVPTXISD::FUN_SHFR_CLAMP:
- return "NVPTXISD::FUN_SHFR_CLAMP";
- case NVPTXISD::IMAD:
- return "NVPTXISD::IMAD";
- case NVPTXISD::SETP_F16X2:
- return "NVPTXISD::SETP_F16X2";
- case NVPTXISD::Dummy:
- return "NVPTXISD::Dummy";
- case NVPTXISD::MUL_WIDE_SIGNED:
- return "NVPTXISD::MUL_WIDE_SIGNED";
- case NVPTXISD::MUL_WIDE_UNSIGNED:
- return "NVPTXISD::MUL_WIDE_UNSIGNED";
- case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
- case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
- case NVPTXISD::Tex1DFloatFloatLevel:
- return "NVPTXISD::Tex1DFloatFloatLevel";
- case NVPTXISD::Tex1DFloatFloatGrad:
- return "NVPTXISD::Tex1DFloatFloatGrad";
- case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
- case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
- case NVPTXISD::Tex1DS32FloatLevel:
- return "NVPTXISD::Tex1DS32FloatLevel";
- case NVPTXISD::Tex1DS32FloatGrad:
- return "NVPTXISD::Tex1DS32FloatGrad";
- case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
- case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
- case NVPTXISD::Tex1DU32FloatLevel:
- return "NVPTXISD::Tex1DU32FloatLevel";
- case NVPTXISD::Tex1DU32FloatGrad:
- return "NVPTXISD::Tex1DU32FloatGrad";
- case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
- case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
- case NVPTXISD::Tex1DArrayFloatFloatLevel:
- return "NVPTXISD::Tex1DArrayFloatFloatLevel";
- case NVPTXISD::Tex1DArrayFloatFloatGrad:
- return "NVPTXISD::Tex1DArrayFloatFloatGrad";
- case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
- case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
- case NVPTXISD::Tex1DArrayS32FloatLevel:
- return "NVPTXISD::Tex1DArrayS32FloatLevel";
- case NVPTXISD::Tex1DArrayS32FloatGrad:
- return "NVPTXISD::Tex1DArrayS32FloatGrad";
- case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
- case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
- case NVPTXISD::Tex1DArrayU32FloatLevel:
- return "NVPTXISD::Tex1DArrayU32FloatLevel";
- case NVPTXISD::Tex1DArrayU32FloatGrad:
- return "NVPTXISD::Tex1DArrayU32FloatGrad";
- case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
- case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
- case NVPTXISD::Tex2DFloatFloatLevel:
- return "NVPTXISD::Tex2DFloatFloatLevel";
- case NVPTXISD::Tex2DFloatFloatGrad:
- return "NVPTXISD::Tex2DFloatFloatGrad";
- case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
- case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
- case NVPTXISD::Tex2DS32FloatLevel:
- return "NVPTXISD::Tex2DS32FloatLevel";
- case NVPTXISD::Tex2DS32FloatGrad:
- return "NVPTXISD::Tex2DS32FloatGrad";
- case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
- case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
- case NVPTXISD::Tex2DU32FloatLevel:
- return "NVPTXISD::Tex2DU32FloatLevel";
- case NVPTXISD::Tex2DU32FloatGrad:
- return "NVPTXISD::Tex2DU32FloatGrad";
- case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
- case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
- case NVPTXISD::Tex2DArrayFloatFloatLevel:
- return "NVPTXISD::Tex2DArrayFloatFloatLevel";
- case NVPTXISD::Tex2DArrayFloatFloatGrad:
- return "NVPTXISD::Tex2DArrayFloatFloatGrad";
- case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
- case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
- case NVPTXISD::Tex2DArrayS32FloatLevel:
- return "NVPTXISD::Tex2DArrayS32FloatLevel";
- case NVPTXISD::Tex2DArrayS32FloatGrad:
- return "NVPTXISD::Tex2DArrayS32FloatGrad";
- case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
- case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
- case NVPTXISD::Tex2DArrayU32FloatLevel:
- return "NVPTXISD::Tex2DArrayU32FloatLevel";
- case NVPTXISD::Tex2DArrayU32FloatGrad:
- return "NVPTXISD::Tex2DArrayU32FloatGrad";
- case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
- case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
- case NVPTXISD::Tex3DFloatFloatLevel:
- return "NVPTXISD::Tex3DFloatFloatLevel";
- case NVPTXISD::Tex3DFloatFloatGrad:
- return "NVPTXISD::Tex3DFloatFloatGrad";
- case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
- case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
- case NVPTXISD::Tex3DS32FloatLevel:
- return "NVPTXISD::Tex3DS32FloatLevel";
- case NVPTXISD::Tex3DS32FloatGrad:
- return "NVPTXISD::Tex3DS32FloatGrad";
- case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
- case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
- case NVPTXISD::Tex3DU32FloatLevel:
- return "NVPTXISD::Tex3DU32FloatLevel";
- case NVPTXISD::Tex3DU32FloatGrad:
- return "NVPTXISD::Tex3DU32FloatGrad";
- case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
- case NVPTXISD::TexCubeFloatFloatLevel:
- return "NVPTXISD::TexCubeFloatFloatLevel";
- case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
- case NVPTXISD::TexCubeS32FloatLevel:
- return "NVPTXISD::TexCubeS32FloatLevel";
- case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
- case NVPTXISD::TexCubeU32FloatLevel:
- return "NVPTXISD::TexCubeU32FloatLevel";
- case NVPTXISD::TexCubeArrayFloatFloat:
- return "NVPTXISD::TexCubeArrayFloatFloat";
- case NVPTXISD::TexCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexCubeArrayFloatFloatLevel";
- case NVPTXISD::TexCubeArrayS32Float:
- return "NVPTXISD::TexCubeArrayS32Float";
- case NVPTXISD::TexCubeArrayS32FloatLevel:
- return "NVPTXISD::TexCubeArrayS32FloatLevel";
- case NVPTXISD::TexCubeArrayU32Float:
- return "NVPTXISD::TexCubeArrayU32Float";
- case NVPTXISD::TexCubeArrayU32FloatLevel:
- return "NVPTXISD::TexCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4R2DFloatFloat:
- return "NVPTXISD::Tld4R2DFloatFloat";
- case NVPTXISD::Tld4G2DFloatFloat:
- return "NVPTXISD::Tld4G2DFloatFloat";
- case NVPTXISD::Tld4B2DFloatFloat:
- return "NVPTXISD::Tld4B2DFloatFloat";
- case NVPTXISD::Tld4A2DFloatFloat:
- return "NVPTXISD::Tld4A2DFloatFloat";
- case NVPTXISD::Tld4R2DS64Float:
- return "NVPTXISD::Tld4R2DS64Float";
- case NVPTXISD::Tld4G2DS64Float:
- return "NVPTXISD::Tld4G2DS64Float";
- case NVPTXISD::Tld4B2DS64Float:
- return "NVPTXISD::Tld4B2DS64Float";
- case NVPTXISD::Tld4A2DS64Float:
- return "NVPTXISD::Tld4A2DS64Float";
- case NVPTXISD::Tld4R2DU64Float:
- return "NVPTXISD::Tld4R2DU64Float";
- case NVPTXISD::Tld4G2DU64Float:
- return "NVPTXISD::Tld4G2DU64Float";
- case NVPTXISD::Tld4B2DU64Float:
- return "NVPTXISD::Tld4B2DU64Float";
- case NVPTXISD::Tld4A2DU64Float:
- return "NVPTXISD::Tld4A2DU64Float";
-
- case NVPTXISD::TexUnified1DFloatS32:
- return "NVPTXISD::TexUnified1DFloatS32";
- case NVPTXISD::TexUnified1DFloatFloat:
- return "NVPTXISD::TexUnified1DFloatFloat";
- case NVPTXISD::TexUnified1DFloatFloatLevel:
- return "NVPTXISD::TexUnified1DFloatFloatLevel";
- case NVPTXISD::TexUnified1DFloatFloatGrad:
- return "NVPTXISD::TexUnified1DFloatFloatGrad";
- case NVPTXISD::TexUnified1DS32S32:
- return "NVPTXISD::TexUnified1DS32S32";
- case NVPTXISD::TexUnified1DS32Float:
- return "NVPTXISD::TexUnified1DS32Float";
- case NVPTXISD::TexUnified1DS32FloatLevel:
- return "NVPTXISD::TexUnified1DS32FloatLevel";
- case NVPTXISD::TexUnified1DS32FloatGrad:
- return "NVPTXISD::TexUnified1DS32FloatGrad";
- case NVPTXISD::TexUnified1DU32S32:
- return "NVPTXISD::TexUnified1DU32S32";
- case NVPTXISD::TexUnified1DU32Float:
- return "NVPTXISD::TexUnified1DU32Float";
- case NVPTXISD::TexUnified1DU32FloatLevel:
- return "NVPTXISD::TexUnified1DU32FloatLevel";
- case NVPTXISD::TexUnified1DU32FloatGrad:
- return "NVPTXISD::TexUnified1DU32FloatGrad";
- case NVPTXISD::TexUnified1DArrayFloatS32:
- return "NVPTXISD::TexUnified1DArrayFloatS32";
- case NVPTXISD::TexUnified1DArrayFloatFloat:
- return "NVPTXISD::TexUnified1DArrayFloatFloat";
- case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified1DArrayS32S32:
- return "NVPTXISD::TexUnified1DArrayS32S32";
- case NVPTXISD::TexUnified1DArrayS32Float:
- return "NVPTXISD::TexUnified1DArrayS32Float";
- case NVPTXISD::TexUnified1DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
- case NVPTXISD::TexUnified1DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
- case NVPTXISD::TexUnified1DArrayU32S32:
- return "NVPTXISD::TexUnified1DArrayU32S32";
- case NVPTXISD::TexUnified1DArrayU32Float:
- return "NVPTXISD::TexUnified1DArrayU32Float";
- case NVPTXISD::TexUnified1DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
- case NVPTXISD::TexUnified1DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
- case NVPTXISD::TexUnified2DFloatS32:
- return "NVPTXISD::TexUnified2DFloatS32";
- case NVPTXISD::TexUnified2DFloatFloat:
- return "NVPTXISD::TexUnified2DFloatFloat";
- case NVPTXISD::TexUnified2DFloatFloatLevel:
- return "NVPTXISD::TexUnified2DFloatFloatLevel";
- case NVPTXISD::TexUnified2DFloatFloatGrad:
- return "NVPTXISD::TexUnified2DFloatFloatGrad";
- case NVPTXISD::TexUnified2DS32S32:
- return "NVPTXISD::TexUnified2DS32S32";
- case NVPTXISD::TexUnified2DS32Float:
- return "NVPTXISD::TexUnified2DS32Float";
- case NVPTXISD::TexUnified2DS32FloatLevel:
- return "NVPTXISD::TexUnified2DS32FloatLevel";
- case NVPTXISD::TexUnified2DS32FloatGrad:
- return "NVPTXISD::TexUnified2DS32FloatGrad";
- case NVPTXISD::TexUnified2DU32S32:
- return "NVPTXISD::TexUnified2DU32S32";
- case NVPTXISD::TexUnified2DU32Float:
- return "NVPTXISD::TexUnified2DU32Float";
- case NVPTXISD::TexUnified2DU32FloatLevel:
- return "NVPTXISD::TexUnified2DU32FloatLevel";
- case NVPTXISD::TexUnified2DU32FloatGrad:
- return "NVPTXISD::TexUnified2DU32FloatGrad";
- case NVPTXISD::TexUnified2DArrayFloatS32:
- return "NVPTXISD::TexUnified2DArrayFloatS32";
- case NVPTXISD::TexUnified2DArrayFloatFloat:
- return "NVPTXISD::TexUnified2DArrayFloatFloat";
- case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified2DArrayS32S32:
- return "NVPTXISD::TexUnified2DArrayS32S32";
- case NVPTXISD::TexUnified2DArrayS32Float:
- return "NVPTXISD::TexUnified2DArrayS32Float";
- case NVPTXISD::TexUnified2DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
- case NVPTXISD::TexUnified2DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
- case NVPTXISD::TexUnified2DArrayU32S32:
- return "NVPTXISD::TexUnified2DArrayU32S32";
- case NVPTXISD::TexUnified2DArrayU32Float:
- return "NVPTXISD::TexUnified2DArrayU32Float";
- case NVPTXISD::TexUnified2DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
- case NVPTXISD::TexUnified2DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
- case NVPTXISD::TexUnified3DFloatS32:
- return "NVPTXISD::TexUnified3DFloatS32";
- case NVPTXISD::TexUnified3DFloatFloat:
- return "NVPTXISD::TexUnified3DFloatFloat";
- case NVPTXISD::TexUnified3DFloatFloatLevel:
- return "NVPTXISD::TexUnified3DFloatFloatLevel";
- case NVPTXISD::TexUnified3DFloatFloatGrad:
- return "NVPTXISD::TexUnified3DFloatFloatGrad";
- case NVPTXISD::TexUnified3DS32S32:
- return "NVPTXISD::TexUnified3DS32S32";
- case NVPTXISD::TexUnified3DS32Float:
- return "NVPTXISD::TexUnified3DS32Float";
- case NVPTXISD::TexUnified3DS32FloatLevel:
- return "NVPTXISD::TexUnified3DS32FloatLevel";
- case NVPTXISD::TexUnified3DS32FloatGrad:
- return "NVPTXISD::TexUnified3DS32FloatGrad";
- case NVPTXISD::TexUnified3DU32S32:
- return "NVPTXISD::TexUnified3DU32S32";
- case NVPTXISD::TexUnified3DU32Float:
- return "NVPTXISD::TexUnified3DU32Float";
- case NVPTXISD::TexUnified3DU32FloatLevel:
- return "NVPTXISD::TexUnified3DU32FloatLevel";
- case NVPTXISD::TexUnified3DU32FloatGrad:
- return "NVPTXISD::TexUnified3DU32FloatGrad";
- case NVPTXISD::TexUnifiedCubeFloatFloat:
- return "NVPTXISD::TexUnifiedCubeFloatFloat";
- case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeS32Float:
- return "NVPTXISD::TexUnifiedCubeS32Float";
- case NVPTXISD::TexUnifiedCubeS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeU32Float:
- return "NVPTXISD::TexUnifiedCubeU32Float";
- case NVPTXISD::TexUnifiedCubeU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayS32Float:
- return "NVPTXISD::TexUnifiedCubeArrayS32Float";
- case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayU32Float:
- return "NVPTXISD::TexUnifiedCubeArrayU32Float";
- case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4UnifiedR2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
- case NVPTXISD::Tld4UnifiedG2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
- case NVPTXISD::Tld4UnifiedB2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
- case NVPTXISD::Tld4UnifiedA2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
- case NVPTXISD::Tld4UnifiedR2DS64Float:
- return "NVPTXISD::Tld4UnifiedR2DS64Float";
- case NVPTXISD::Tld4UnifiedG2DS64Float:
- return "NVPTXISD::Tld4UnifiedG2DS64Float";
- case NVPTXISD::Tld4UnifiedB2DS64Float:
- return "NVPTXISD::Tld4UnifiedB2DS64Float";
- case NVPTXISD::Tld4UnifiedA2DS64Float:
- return "NVPTXISD::Tld4UnifiedA2DS64Float";
- case NVPTXISD::Tld4UnifiedR2DU64Float:
- return "NVPTXISD::Tld4UnifiedR2DU64Float";
- case NVPTXISD::Tld4UnifiedG2DU64Float:
- return "NVPTXISD::Tld4UnifiedG2DU64Float";
- case NVPTXISD::Tld4UnifiedB2DU64Float:
- return "NVPTXISD::Tld4UnifiedB2DU64Float";
- case NVPTXISD::Tld4UnifiedA2DU64Float:
- return "NVPTXISD::Tld4UnifiedA2DU64Float";
-
- case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
- case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
- case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
- case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
- case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
- case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
- case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
- case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
- case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
- case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
- case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
-
- case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
- case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
- case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
- case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
- case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
- case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
- case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
- case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
- case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
- case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
- case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
-
- case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
- case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
- case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
- case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
- case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
- case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
- case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
- case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
- case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
- case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
- case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
-
- case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
- case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
- case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
- case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
- case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
- case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
- case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
- case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
- case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
- case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
- case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
-
- case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
- case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
- case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
- case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
- case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
- case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
- case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
- case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
- case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
- case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
- case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
-
- case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
- case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
- case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
- case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
- case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
- case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
- case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
- case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
- case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
- case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
- case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
-
- case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
- case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
- case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
- case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
- case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
- case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
- case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
- case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
- case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
- case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
- case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
-
- case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
- case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
- case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
- case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
- case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
- case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
- case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
- case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
- case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
- case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
- case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
-
- case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
- case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
- case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
- case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
- case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
- case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
- case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
- case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
- case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
- case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
- case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
-
- case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
- case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
- case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
- case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
- case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
- case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
- case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
- case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
- case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
- case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
- case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
-
- case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
- case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
- case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
- case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
- case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
- case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
- case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
- case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
- case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
- case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
- case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
-
- case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
- case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
- case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
- case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
- case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
- case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
- case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
- case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
- case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
- case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
- case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
-
- case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
- case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
- case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
- case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
- case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
- case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
- case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
- case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
- case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
- case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
- case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
-
- case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
- case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
- case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
- case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
- case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
- case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
- case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
- case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
- case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
- case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
- case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
-
- case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
- case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
- case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
- case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
- case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
- case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
- case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
- case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
- case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
- case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
- case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
- }
- return nullptr;
-}
-
-TargetLoweringBase::LegalizeTypeAction
-NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
- if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
- return TypeSplitVector;
- if (VT == MVT::v2f16)
- return TypeLegal;
- return TargetLoweringBase::getPreferredVectorAction(VT);
-}
-
-SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
- int Enabled, int &ExtraSteps,
- bool &UseOneConst,
- bool Reciprocal) const {
- if (!(Enabled == ReciprocalEstimate::Enabled ||
- (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
- return SDValue();
-
- if (ExtraSteps == ReciprocalEstimate::Unspecified)
- ExtraSteps = 0;
-
- SDLoc DL(Operand);
- EVT VT = Operand.getValueType();
- bool Ftz = useF32FTZ(DAG.getMachineFunction());
-
- auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(IID, DL, MVT::i32), Operand);
- };
-
- // The sqrt and rsqrt refinement processes assume we always start out with an
- // approximation of the rsqrt. Therefore, if we're going to do any refinement
- // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
- // any refinement, we must return a regular sqrt.
- if (Reciprocal || ExtraSteps > 0) {
- if (VT == MVT::f32)
- return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
- : Intrinsic::nvvm_rsqrt_approx_f);
- else if (VT == MVT::f64)
- return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
- else
- return SDValue();
- } else {
- if (VT == MVT::f32)
- return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
- : Intrinsic::nvvm_sqrt_approx_f);
- else {
- // There's no sqrt.approx.f64 instruction, so we emit
- // reciprocal(rsqrt(x)). This is faster than
- // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
- // x * rsqrt(x).)
- return DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
- MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
- }
- }
-}
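-
-// Summary of the selection above, using the same intrinsics the code emits:
-//   f32, rsqrt or refined sqrt -> nvvm_rsqrt_approx_f  (or _ftz_f in FTZ mode)
-//   f64, rsqrt or refined sqrt -> nvvm_rsqrt_approx_d
-//   f32 sqrt, no refinement    -> nvvm_sqrt_approx_f   (or _ftz_f)
-//   f64 sqrt, no refinement    -> nvvm_rcp_approx_ftz_d(nvvm_rsqrt_approx_d(x))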
-
-SDValue
-NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
- return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
-}
-
-std::string NVPTXTargetLowering::getPrototype(
- const DataLayout &DL, Type *retTy, const ArgListTy &Args,
- const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
- const ImmutableCallSite *CS) const {
- auto PtrVT = getPointerTy(DL);
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return "";
-
- std::stringstream O;
- O << "prototype_" << uniqueCallSite << " : .callprototype ";
-
- if (retTy->getTypeID() == Type::VoidTyID) {
- O << "()";
- } else {
- O << "(";
- if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
- unsigned size = 0;
- if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
- size = ITy->getBitWidth();
- } else {
- assert(retTy->isFloatingPointTy() &&
- "Floating point type expected here");
- size = retTy->getPrimitiveSizeInBits();
- }
- // PTX ABI requires all scalar return values to be at least 32
- // bits in size. fp16 normally uses .b16 as its storage type in
- // PTX, so its size must be adjusted here, too.
- if (size < 32)
- size = 32;
-
- O << ".param .b" << size << " _";
- } else if (isa<PointerType>(retTy)) {
- O << ".param .b" << PtrVT.getSizeInBits() << " _";
- } else if (retTy->isAggregateType() || retTy->isVectorTy()) {
- auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
- O << ".param .align " << retAlignment << " .b8 _["
- << DL.getTypeAllocSize(retTy) << "]";
- } else {
- llvm_unreachable("Unknown return type");
- }
- O << ") ";
- }
- O << "_ (";
-
- bool first = true;
-
- unsigned OIdx = 0;
- for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
- Type *Ty = Args[i].Ty;
- if (!first) {
- O << ", ";
- }
- first = false;
-
- if (!Outs[OIdx].Flags.isByVal()) {
- if (Ty->isAggregateType() || Ty->isVectorTy()) {
- unsigned align = 0;
- const CallInst *CallI = cast<CallInst>(CS->getInstruction());
- // +1 because index 0 is reserved for return type alignment
- if (!getAlign(*CallI, i + 1, align))
- align = DL.getABITypeAlignment(Ty);
- unsigned sz = DL.getTypeAllocSize(Ty);
- O << ".param .align " << align << " .b8 ";
- O << "_";
- O << "[" << sz << "]";
- // update the index for Outs
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, DL, Ty, vtparts);
- if (unsigned len = vtparts.size())
- OIdx += len - 1;
- continue;
- }
- // i8 types in IR will be i16 types in SDAG
- assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
- (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
- "type mismatch between callee prototype and arguments");
- // scalar type
- unsigned sz = 0;
- if (isa<IntegerType>(Ty)) {
- sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
- } else if (isa<PointerType>(Ty)) {
- sz = PtrVT.getSizeInBits();
- } else if (Ty->isHalfTy())
- // PTX ABI requires all scalar parameters to be at least 32
- // bits in size. fp16 normally uses .b16 as its storage type
- // in PTX, so its size must be adjusted here, too.
- sz = 32;
- else
- sz = Ty->getPrimitiveSizeInBits();
- O << ".param .b" << sz << " ";
- O << "_";
- continue;
- }
- auto *PTy = dyn_cast<PointerType>(Ty);
- assert(PTy && "Param with byval attribute should be a pointer type");
- Type *ETy = PTy->getElementType();
-
- unsigned align = Outs[OIdx].Flags.getByValAlign();
- unsigned sz = DL.getTypeAllocSize(ETy);
- O << ".param .align " << align << " .b8 ";
- O << "_";
- O << "[" << sz << "]";
- }
- O << ");";
- return O.str();
-}
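-
-// For example, a callee taking (i32, 16-byte struct with 4-byte alignment)
-// and returning float would get a string along these lines (call-site number
-// and sizes are hypothetical):
-//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _,
-//   .param .align 4 .b8 _[16]);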
-
-unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
- const ImmutableCallSite *CS,
- Type *Ty, unsigned Idx,
- const DataLayout &DL) const {
- if (!CS) {
- // CallSite is zero; fall back to the ABI type alignment.
- return DL.getABITypeAlignment(Ty);
- }
-
- unsigned Align = 0;
- const Value *DirectCallee = CS->getCalledFunction();
-
- if (!DirectCallee) {
- // We don't have a direct function symbol, but that may be because of
- // constant cast instructions in the call.
- const Instruction *CalleeI = CS->getInstruction();
- assert(CalleeI && "Call target is not a function or derived value?");
-
- // With bitcast'd call targets, the instruction will be the call
- if (isa<CallInst>(CalleeI)) {
- // Check if we have call alignment metadata
- if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
- return Align;
-
- const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
- // Ignore any bitcast instructions
- while (isa<ConstantExpr>(CalleeV)) {
- const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
- if (!CE->isCast())
- break;
- // Look through the bitcast
- CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
- }
-
- // We have now looked past all of the bitcasts. Do we finally have a
- // Function?
- if (isa<Function>(CalleeV))
- DirectCallee = CalleeV;
- }
- }
-
- // Check for function alignment information if we found that the
- // ultimate target is a Function
- if (DirectCallee)
- if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
- return Align;
-
- // Call is indirect or alignment information is not available, fall back to
- // the ABI type alignment
- return DL.getABITypeAlignment(Ty);
-}
-
-SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
- SelectionDAG &DAG = CLI.DAG;
- SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
- SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
- SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
- ArgListTy &Args = CLI.getArgs();
- Type *RetTy = CLI.RetTy;
- ImmutableCallSite *CS = CLI.CS;
- const DataLayout &DL = DAG.getDataLayout();
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
-
- SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
- SDValue InFlag = Chain.getValue(1);
-
- unsigned paramCount = 0;
- // Args.size() and Outs.size() need not match.
- // Outs.size() will be larger
- // * if there is an aggregate argument with multiple fields (each field
- // showing up separately in Outs)
- // * if there is a vector argument with more than typical vector-length
- // elements (generally if more than 4) where each vector element is
- // individually present in Outs.
- // So a different index should be used for indexing into Outs/OutVals.
- // See similar issue in LowerFormalArguments.
- unsigned OIdx = 0;
- // Declare the .params or .reg needed to pass values
- // to the function
- for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
- EVT VT = Outs[OIdx].VT;
- Type *Ty = Args[i].Ty;
-
- if (!Outs[OIdx].Flags.isByVal()) {
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
- unsigned ArgAlign =
- getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
- unsigned AllocSize = DL.getTypeAllocSize(Ty);
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- bool NeedAlign; // Does argument declaration specify alignment?
- if (Ty->isAggregateType() || Ty->isVectorTy()) {
- // declare .param .align <align> .b8 .param<n>[<size>];
- SDValue DeclareParamOps[] = {
- Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- NeedAlign = true;
- } else {
- // declare .param .b<size> .param<n>;
- if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
- // PTX ABI requires integral types to be at least 32 bits in
- // size. FP16 is loaded/stored using i16, so it's handled
- // here as well.
- AllocSize = 4;
- }
- SDValue DeclareScalarParamOps[] = {
- Chain, DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(AllocSize * 8, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
- DeclareScalarParamOps);
- NeedAlign = false;
- }
- InFlag = Chain.getValue(1);
-
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
- // than 32-bits are sign extended or zero extended, depending on
- // whether they are signed or unsigned types. This case applies
- // only to scalar parameters and not to aggregate values.
- bool ExtendIntegerParam =
- Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
-
- auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
- SmallVector<SDValue, 6> StoreOperands;
- for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
- // New store.
- if (VectorInfo[j] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Unfinished preceding store.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
- StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
- }
-
- EVT EltVT = VTs[j];
- SDValue StVal = OutVals[OIdx];
- if (ExtendIntegerParam) {
- assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
- // zext/sext to i32
- StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND,
- dl, MVT::i32, StVal);
- } else if (EltVT.getSizeInBits() < 16) {
- // Use 16-bit registers for small stores as it's the
- // smallest general purpose register size supported by NVPTX.
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
-
- // Record the value to store.
- StoreOperands.push_back(StVal);
-
- if (VectorInfo[j] & PVF_LAST) {
- unsigned NumElts = StoreOperands.size() - 3;
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreParam;
- break;
- case 2:
- Op = NVPTXISD::StoreParamV2;
- break;
- case 4:
- Op = NVPTXISD::StoreParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
-
- StoreOperands.push_back(InFlag);
-
- // Adjust the type of the store op if we've extended the scalar
- // value being stored.
- EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
- unsigned EltAlign =
- NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
-
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), EltAlign,
- /* Volatile */ false, /* ReadMem */ false,
- /* WriteMem */ true, /* Size */ 0);
- InFlag = Chain.getValue(1);
-
- // Cleanup.
- StoreOperands.clear();
- }
- ++OIdx;
- }
- assert(StoreOperands.empty() && "Unfinished parameter store.");
- if (VTs.size() > 0)
- --OIdx;
- ++paramCount;
- continue;
- }
-
- // ByVal arguments
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
- assert(PTy && "Type of a byval parameter should be pointer");
- ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
-
- // declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = Outs[OIdx].Flags.getByValSize();
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
- // The ByValAlign in Outs[OIdx].Flags is always set at this point,
- // so we don't need to worry about whether natural alignment applies.
- // See TargetLowering::LowerCallTo().
-
- // Enforce a minimum alignment of 4 to work around a ptxas miscompile
- // for sm_50+. See corresponding alignment adjustment in
- // emitFunctionParamList() for details.
- if (ArgAlign < 4)
- ArgAlign = 4;
- SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
- EVT elemtype = VTs[j];
- int curOffset = Offsets[j];
- unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
- auto PtrVT = getPointerTy(DL);
- SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
- DAG.getConstant(curOffset, dl, PtrVT));
- SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), PartAlign);
- if (elemtype.getSizeInBits() < 16) {
- theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(curOffset, dl, MVT::i32),
- theVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
- CopyParamOps, elemtype,
- MachinePointerInfo(), /* Align */ 0,
- /* Volatile */ false, /* ReadMem */ false,
- /* WriteMem */ true, /* Size */ 0);
-
- InFlag = Chain.getValue(1);
- }
- ++paramCount;
- }
-
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
- unsigned retAlignment = 0;
-
- // Handle Result
- if (Ins.size() > 0) {
- SmallVector<EVT, 16> resvtparts;
- ComputeValueVTs(*this, DL, RetTy, resvtparts);
-
- // Declare
- // .param .align 16 .b8 retval0[<size-in-bytes>], or
- // .param .b<size-in-bits> retval0
- unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
- // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
- // these three types to match the logic in
- // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
- // Plus, this behavior is consistent with nvcc's.
- if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
- RetTy->isPointerTy()) {
- // Scalars need to be at least 32 bits wide
- if (resultsz < 32)
- resultsz = 32;
- SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(resultsz, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
- DeclareRetOps);
- InFlag = Chain.getValue(1);
- } else {
- retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
- SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = { Chain,
- DAG.getConstant(retAlignment, dl, MVT::i32),
- DAG.getConstant(resultsz / 8, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
- DeclareRetOps);
- InFlag = Chain.getValue(1);
- }
- }
-
- if (!Func) {
- // This is the indirect function call case: PTX requires a prototype of the
- // form
- // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
- // to be emitted, and the label has to be used as the last arg of the call
- // instruction.
- // The prototype is embedded in a string and passed as the operand of a
- // CallPrototype SDNode, which prints out as the value of the string.
- SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
- const char *ProtoStr =
- nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
- SDValue ProtoOps[] = {
- Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
- };
- Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
- InFlag = Chain.getValue(1);
- }
- // Op to just print "call"
- SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrintCallOps[] = {
- Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
- };
- // We model convergent calls as separate opcodes.
- unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
- if (CLI.IsConvergent)
- Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
- : NVPTXISD::PrintConvergentCall;
- Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
- InFlag = Chain.getValue(1);
-
- // Ops to print out the function name
- SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallVoidOps[] = { Chain, Callee, InFlag };
- Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
- InFlag = Chain.getValue(1);
-
- // Ops to print out the param list
- SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgBeginOps[] = { Chain, InFlag };
- Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
- CallArgBeginOps);
- InFlag = Chain.getValue(1);
-
- for (unsigned i = 0, e = paramCount; i != e; ++i) {
- unsigned opcode;
- if (i == (e - 1))
- opcode = NVPTXISD::LastCallArg;
- else
- opcode = NVPTXISD::CallArg;
- SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(i, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
- InFlag = Chain.getValue(1);
- }
- SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgEndOps[] = { Chain,
- DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
- InFlag = Chain.getValue(1);
-
- if (!Func) {
- SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrototypeOps[] = { Chain,
- DAG.getConstant(uniqueCallSite, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
- InFlag = Chain.getValue(1);
- }
-
- // Generate loads from param memory/moves from registers for result
- if (Ins.size() > 0) {
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
- assert(VTs.size() == Ins.size() && "Bad value decomposition");
-
- unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
- auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
-
- SmallVector<EVT, 6> LoadVTs;
- int VecIdx = -1; // Index of the first element of the vector.
-
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
- // 32-bits are sign extended or zero extended, depending on whether
- // they are signed or unsigned types.
- bool ExtendIntegerRetVal =
- RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
-
- for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- bool needTruncate = false;
- EVT TheLoadType = VTs[i];
- EVT EltType = Ins[i].VT;
- unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
- if (ExtendIntegerRetVal) {
- TheLoadType = MVT::i32;
- EltType = MVT::i32;
- needTruncate = true;
- } else if (TheLoadType.getSizeInBits() < 16) {
- if (VTs[i].isInteger())
- needTruncate = true;
- EltType = MVT::i16;
- }
-
- // Record index of the very first element of the vector.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
- VecIdx = i;
- }
-
- LoadVTs.push_back(EltType);
-
- if (VectorInfo[i] & PVF_LAST) {
- unsigned NumElts = LoadVTs.size();
- LoadVTs.push_back(MVT::Other);
- LoadVTs.push_back(MVT::Glue);
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::LoadParam;
- break;
- case 2:
- Op = NVPTXISD::LoadParamV2;
- break;
- case 4:
- Op = NVPTXISD::LoadParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
-
- SDValue LoadOperands[] = {
- Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
- SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign, /* Volatile */ false,
- /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0);
-
- for (unsigned j = 0; j < NumElts; ++j) {
- SDValue Ret = RetVal.getValue(j);
- if (needTruncate)
- Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
- InVals.push_back(Ret);
- }
- Chain = RetVal.getValue(NumElts);
- InFlag = RetVal.getValue(NumElts + 1);
-
- // Cleanup
- VecIdx = -1;
- LoadVTs.clear();
- }
- }
- }
-
- Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(uniqueCallSite, dl, true),
- DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
- true),
- InFlag, dl);
- uniqueCallSite++;
-
- // set isTailCall to false for now, until we figure out how to express
- // tail call optimization in PTX
- isTailCall = false;
- return Chain;
-}
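-
-// Taken together, the target nodes built above print as a PTX call sequence
-// roughly like the following (illustrative; names, sizes and registers are
-// hypothetical):
-//   .param .b32 param0;
-//   st.param.b32 [param0+0], %r1;
-//   .param .b32 retval0;
-//   call.uni (retval0), foo, (param0);
-//   ld.param.b32 %r2, [retval0+0];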
-
-// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
-// (see LegalizeDAG.cpp). This is slow and uses local memory.
-// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
-SDValue
-NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- SmallVector<SDValue, 8> Ops;
- unsigned NumOperands = Node->getNumOperands();
- for (unsigned i = 0; i < NumOperands; ++i) {
- SDValue SubOp = Node->getOperand(i);
- EVT VVT = SubOp.getNode()->getValueType(0);
- EVT EltVT = VVT.getVectorElementType();
- unsigned NumSubElem = VVT.getVectorNumElements();
- for (unsigned j = 0; j < NumSubElem; ++j) {
- Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
- DAG.getIntPtrConstant(j, dl)));
- }
- }
- return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
-}
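-
-// E.g. concatenating two hypothetical v2f32 values A and B becomes
-//   BUILD_VECTOR v4f32, (extractelt A, 0), (extractelt A, 1),
-//   (extractelt B, 0), (extractelt B, 1)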
-
-// We can initialize a constant f16x2 with a single .b32 move. Normally it
-// would get lowered as two constant loads and a vector-packing move:
-// mov.b16 %h1, 0x4000;
-// mov.b16 %h2, 0x3C00;
-// mov.b32 %hh2, {%h2, %h1};
-// Instead we want just a constant move:
-// mov.b32 %hh2, 0x40003C00
-//
-// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
-// generates good SASS in both cases.
-SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
- if (!(Op->getValueType(0) == MVT::v2f16 &&
- isa<ConstantFPSDNode>(Op->getOperand(0)) &&
- isa<ConstantFPSDNode>(Op->getOperand(1))))
- return Op;
-
- APInt E0 =
- cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
- APInt E1 =
- cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
- SDValue Const =
- DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
- return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
-}
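-
-// E.g. BUILD_VECTOR <half 1.0, half 2.0> gives E0 = 0x3C00 and E1 = 0x4000,
-// which pack to (E1 << 16) | E0 = 0x40003C00, matching the comment above.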
-
-SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue Index = Op->getOperand(1);
- // Constant index will be matched by tablegen.
- if (isa<ConstantSDNode>(Index.getNode()))
- return Op;
-
- // Extract individual elements and select one of them.
- SDValue Vector = Op->getOperand(0);
- EVT VectorVT = Vector.getValueType();
- assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
- EVT EltVT = VectorVT.getVectorElementType();
-
- SDLoc dl(Op.getNode());
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
- DAG.getIntPtrConstant(0, dl));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
- DAG.getIntPtrConstant(1, dl));
- return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
- ISD::CondCode::SETEQ);
-}
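-
-// E.g. extracting a non-constant index i from a v2f16 value %v becomes, in
-// IR-like pseudocode: select (i == 0), extractelt(%v, 0), extractelt(%v, 1).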
-
-/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
-/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
-/// amount, or
-/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
-/// amount.
-SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
-
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
- if (VTBits == 32 && STI.getSmVersion() >= 35) {
- // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
- // {dHi, dLo} = {aHi, aLo} >> Amt
- // dHi = aHi >> Amt
- // dLo = shf.r.clamp aLo, aHi, Amt
-
- SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
- ShAmt);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- else {
- // {dHi, dLo} = {aHi, aLo} >> Amt
- // - if (Amt>=size) then
- // dLo = aHi >> (Amt-size)
- // dHi = aHi >> Amt (this is either all 0 or all 1)
- // else
- // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
- // dHi = aHi >> Amt
-
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-
- SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ISD::SETGE);
- SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
-}
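-
-// Worked example for the generic path (hypothetical values): a 64-bit
-// logical right shift split into {aHi, aLo} with Amt = 40 and size = 32
-// takes the Amt >= size branch, so dLo = aHi >> (40 - 32) = aHi >> 8 and
-// dHi = aHi >> 40, i.e. all zero bits for a logical shift.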
-
-/// LowerShiftLeftParts - Lower SHL_PARTS, which
-/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
-/// amount, or
-/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
-/// amount.
-SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- assert(Op.getOpcode() == ISD::SHL_PARTS);
-
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
-
- if (VTBits == 32 && STI.getSmVersion() >= 35) {
- // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
- // {dHi, dLo} = {aHi, aLo} << Amt
- // dHi = shf.l.clamp aLo, aHi, Amt
- // dLo = aLo << Amt
-
- SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
- ShAmt);
- SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- else {
- // {dHi, dLo} = {aHi, aLo} << Amt
- // - if (Amt>=size) then
- // dLo = aLo << Amt (all 0)
- // dHi = aLo << (Amt-size)
- // else
- // dLo = aLo << Amt
- // dHi = (aHi << Amt) | (aLo >> (size-Amt))
-
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32));
- SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
-
- SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ISD::SETGE);
- SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
-}
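-
-// Worked example for the generic path (hypothetical values): shifting
-// {aHi, aLo} left by Amt = 8 with size = 32 takes the Amt < size branch, so
-// dLo = aLo << 8 and dHi = (aHi << 8) | (aLo >> 24).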
-
-SDValue
-NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- switch (Op.getOpcode()) {
- case ISD::RETURNADDR:
- return SDValue();
- case ISD::FRAMEADDR:
- return SDValue();
- case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
- case ISD::INTRINSIC_W_CHAIN:
- return Op;
- case ISD::BUILD_VECTOR:
- return LowerBUILD_VECTOR(Op, DAG);
- case ISD::EXTRACT_SUBVECTOR:
- return Op;
- case ISD::EXTRACT_VECTOR_ELT:
- return LowerEXTRACT_VECTOR_ELT(Op, DAG);
- case ISD::CONCAT_VECTORS:
- return LowerCONCAT_VECTORS(Op, DAG);
- case ISD::STORE:
- return LowerSTORE(Op, DAG);
- case ISD::LOAD:
- return LowerLOAD(Op, DAG);
- case ISD::SHL_PARTS:
- return LowerShiftLeftParts(Op, DAG);
- case ISD::SRA_PARTS:
- case ISD::SRL_PARTS:
- return LowerShiftRightParts(Op, DAG);
- case ISD::SELECT:
- return LowerSelect(Op, DAG);
- default:
- llvm_unreachable("Custom lowering not defined for operation");
- }
-}
-
-SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
- SDValue Op0 = Op->getOperand(0);
- SDValue Op1 = Op->getOperand(1);
- SDValue Op2 = Op->getOperand(2);
- SDLoc DL(Op.getNode());
-
- assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
-
- Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
- Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
- SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
-
- return Trunc;
-}
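-
-// E.g. (select i1 %c, i1 %a, i1 %b) becomes, in IR-like pseudocode:
-//   trunc (select %c, (any_extend %a to i32), (any_extend %b to i32)) to i1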
-
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
-
- // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
- // loads; we have to handle them here.
- if (Op.getValueType() == MVT::v2f16) {
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- Load->getAddressSpace(), Load->getAlignment())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, SDLoc(Op));
- }
- }
-
- return SDValue();
-}
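-
-// E.g. a v2f16 load with only 2-byte alignment (a hypothetical case) fails
-// the allowsMemoryAccess() check and is expanded here, via
-// expandUnalignedLoad(), into narrower loads plus reassembly, since the
-// legalizer never sees the legal v2f16 type.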
-
-// v = ld i1* addr
-// =>
-// v1 = ld i8* addr (-> i16)
-// v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
- assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
- SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LD->getAlignment(),
- LD->getMemOperand()->getFlags());
- SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
- // The legalizer (the caller) is expecting two values from the legalized
- // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
- // in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
-}
-
-SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT VT = Store->getMemoryVT();
-
- if (VT == MVT::i1)
- return LowerSTOREi1(Op, DAG);
-
- // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
- // stores; we have to handle them here.
- if (VT == MVT::v2f16 &&
- !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- Store->getAddressSpace(), Store->getAlignment()))
- return expandUnalignedStore(Store, DAG);
-
- if (VT.isVector())
- return LowerSTOREVector(Op, DAG);
-
- return SDValue();
-}
-
-SDValue
-NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
- SDNode *N = Op.getNode();
- SDValue Val = N->getOperand(1);
- SDLoc DL(N);
- EVT ValVT = Val.getValueType();
-
- if (ValVT.isVector()) {
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 stores of <2 x double> here
- // but I'm leaving that as a TODO for now.
- if (!ValVT.isSimple())
- return SDValue();
- switch (ValVT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f16:
- case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
- // This is a "native" vector type
- break;
- }
-
- MemSDNode *MemSD = cast<MemSDNode>(N);
- const DataLayout &TD = DAG.getDataLayout();
-
- unsigned Align = MemSD->getAlignment();
- unsigned PrefAlign =
- TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
- if (Align < PrefAlign) {
- // This store is not sufficiently aligned, so bail out and let this vector
- // store be scalarized. Note that we may still be able to emit smaller
- // vector stores. For example, if we are storing a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return SDValue();
- }
-
- unsigned Opcode = 0;
- EVT EltVT = ValVT.getVectorElementType();
- unsigned NumElts = ValVT.getVectorNumElements();
-
- // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // stored type to i16 and propagate the "real" type as the memory type.
- bool NeedExt = false;
- if (EltVT.getSizeInBits() < 16)
- NeedExt = true;
-
- bool StoreF16x2 = false;
- switch (NumElts) {
- default:
- return SDValue();
- case 2:
- Opcode = NVPTXISD::StoreV2;
- break;
- case 4:
- Opcode = NVPTXISD::StoreV4;
- break;
- case 8:
- // v8f16 is a special case. PTX doesn't have a st.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // store them with st.v4.b32.
- assert(EltVT == MVT::f16 && "Wrong type for the vector.");
- Opcode = NVPTXISD::StoreV4;
- StoreF16x2 = true;
- break;
- }
-
- SmallVector<SDValue, 8> Ops;
-
- // First is the chain
- Ops.push_back(N->getOperand(0));
-
- if (StoreF16x2) {
- // Combine f16,f16 -> v2f16
- NumElts /= 2;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
- DAG.getIntPtrConstant(i * 2, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
- DAG.getIntPtrConstant(i * 2 + 1, DL));
- SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
- Ops.push_back(V2);
- }
- } else {
- // Then the split values
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
- if (NeedExt)
- ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
- Ops.push_back(ExtVal);
- }
- }
-
- // Then any remaining arguments
- Ops.append(N->op_begin() + 2, N->op_end());
-
- SDValue NewSt =
- DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
-
- return NewSt;
- }
-
- return SDValue();
-}
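-
-// E.g. storing a hypothetical v8f16 value %v emits one NVPTXISD::StoreV4
-// whose four value operands are the v2f16 pairs {v0,v1}, {v2,v3}, {v4,v5},
-// {v6,v7}, so ptxas sees a single st.v4.b32.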
-
-// st i1 v, addr
-// =>
-// v1 = zxt v to i16
-// st.u8 i16, addr
-SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- StoreSDNode *ST = cast<StoreSDNode>(Node);
- SDValue Tmp1 = ST->getChain();
- SDValue Tmp2 = ST->getBasePtr();
- SDValue Tmp3 = ST->getValue();
- assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
- Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
- SDValue Result =
- DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
- ST->getAlignment(), ST->getMemOperand()->getFlags());
- return Result;
-}
-
-SDValue
-NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
- std::string ParamSym;
- raw_string_ostream ParamStr(ParamSym);
-
- ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
- ParamStr.flush();
-
- std::string *SavedStr =
- nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
- return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
-}
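-
-// E.g. for a hypothetical function "foo", parameter index 2 is referenced
-// through the symbol "foo_param_2".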
-
-// Check to see if the kernel argument is image*_t or sampler_t
-
-static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
- static const char *const specialTypes[] = { "struct._image2d_t",
- "struct._image3d_t",
- "struct._sampler_t" };
-
- Type *Ty = arg->getType();
- auto *PTy = dyn_cast<PointerType>(Ty);
-
- if (!PTy)
- return false;
-
- if (!context)
- return false;
-
- auto *STy = dyn_cast<StructType>(PTy->getElementType());
- if (!STy || STy->isLiteral())
- return false;
-
- return std::find(std::begin(specialTypes), std::end(specialTypes),
- STy->getName()) != std::end(specialTypes);
-}
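-
-// E.g. a hypothetical argument of type %struct._image2d_t* matches, while a
-// pointer to a literal (unnamed) struct type does not.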
-
-SDValue NVPTXTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const DataLayout &DL = DAG.getDataLayout();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
-
- const Function *F = MF.getFunction();
- const AttributeList &PAL = F->getAttributes();
- const TargetLowering *TLI = STI.getTargetLowering();
-
- SDValue Root = DAG.getRoot();
- std::vector<SDValue> OutChains;
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
-
- std::vector<Type *> argTypes;
- std::vector<const Argument *> theArgs;
- for (const Argument &I : F->args()) {
- theArgs.push_back(&I);
- argTypes.push_back(I.getType());
- }
- // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
- // Ins.size() will be larger
- // * if there is an aggregate argument with multiple fields (each field
- // showing up separately in Ins)
- // * if there is a vector argument with more than typical vector-length
- // elements (generally if more than 4) where each vector element is
- // individually present in Ins.
- // So a different index should be used for indexing into Ins.
- // See similar issue in LowerCall.
- unsigned InsIdx = 0;
-
- int idx = 0;
- for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
- Type *Ty = argTypes[i];
-
- // If the kernel argument is image*_t or sampler_t, convert it to
- // an i32 constant holding the parameter position. This can later be
- // matched in the AsmPrinter to output the correct mangled name.
- if (isImageOrSamplerVal(
- theArgs[i],
- (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
- : nullptr))) {
- assert(isKernelFunction(*F) &&
- "Only kernels can have image/sampler params");
- InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
- continue;
- }
-
- if (theArgs[i]->use_empty()) {
- // argument is dead
- if (Ty->isAggregateType()) {
- SmallVector<EVT, 16> vtparts;
-
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
- assert(vtparts.size() > 0 && "empty aggregate type not expected");
- for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
- ++parti) {
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- ++InsIdx;
- }
- if (vtparts.size() > 0)
- --InsIdx;
- continue;
- }
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
- for (unsigned parti = 0; parti < NumRegs; ++parti) {
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- ++InsIdx;
- }
- if (NumRegs > 0)
- --InsIdx;
- continue;
- }
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- continue;
- }
-
- // In the following cases, assign a node order of "idx+1" to newly created
- // nodes. The SDNodes for params have to appear in the same order as they
- // appear in the original function, and "idx+1" preserves that order.
- if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
- bool aggregateIsPacked = false;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- aggregateIsPacked = STy->isPacked();
-
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
- assert(VTs.size() > 0 && "Unexpected empty type.");
- auto VectorInfo =
- VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
-
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- int VecIdx = -1; // Index of the first element of the current vector.
- for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
- if (VectorInfo[parti] & PVF_FIRST) {
- assert(VecIdx == -1 && "Orphaned vector.");
- VecIdx = parti;
- }
-
- // That's the last element of this load op.
- if (VectorInfo[parti] & PVF_LAST) {
- unsigned NumElts = parti - VecIdx + 1;
- EVT EltVT = VTs[parti];
- // i1 is loaded/stored as i8.
- EVT LoadVT = EltVT;
- if (EltVT == MVT::i1)
- LoadVT = MVT::i8;
- else if (EltVT == MVT::v2f16)
- // getLoad needs a vector type, but it can't handle
- // vectors which contain v2f16 elements. So we must load
- // using i32 here and then bitcast back.
- LoadVT = MVT::i32;
-
- EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
- SDValue VecAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
- DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
- Value *srcValue = Constant::getNullValue(PointerType::get(
- EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue P =
- DAG.getLoad(VecVT, dl, Root, VecAddr,
- MachinePointerInfo(srcValue), aggregateIsPacked,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
- for (unsigned j = 0; j < NumElts; ++j) {
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
- DAG.getIntPtrConstant(j, dl));
- // We've loaded i1 as an i8 and now must truncate it back to i1
- if (EltVT == MVT::i1)
- Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
- // v2f16 was loaded as an i32. Now we must bitcast it back.
- else if (EltVT == MVT::v2f16)
- Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
- // Extend the element if necessary (e.g. an i8 is loaded
- // into an i16 register)
- if (Ins[InsIdx].VT.isInteger() &&
- Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
- unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
- Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
- }
- InVals.push_back(Elt);
- }
-
- // Reset vector tracking state.
- VecIdx = -1;
- }
- ++InsIdx;
- }
- if (VTs.size() > 0)
- --InsIdx;
- continue;
- }
-
- // Param has the ByVal attribute: return MoveParam(param symbol).
- // Ideally, the param symbol could be returned directly, but when the
- // SDNode builder decides to use it in a CopyToReg(), the machine
- // instruction fails because a TargetExternalSymbol (not lowered) is
- // target dependent and CopyToReg assumes the source is lowered.
- EVT ObjectVT = getValueType(DL, Ty);
- assert(ObjectVT == Ins[InsIdx].VT &&
- "Ins type did not match function type");
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
- }
-
- // Clang will check explicit varargs and issue an error if any are present.
- // However, Clang will let code with an implicit vararg declaration like
- // f() pass. See bug 617733. We treat this case as if the arg list is empty.
-
- if (!OutChains.empty())
- DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
-
- return Chain;
-}
-
-SDValue
-NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &dl, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- Type *RetTy = MF.getFunction()->getReturnType();
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
-
- const DataLayout DL = DAG.getDataLayout();
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
- assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
-
- auto VectorInfo = VectorizePTXValueVTs(
- VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
-
- // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
- // 32-bits are sign extended or zero extended, depending on whether
- // they are signed or unsigned types.
- bool ExtendIntegerRetVal =
- RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
-
- SmallVector<SDValue, 6> StoreOperands;
- for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
- // New load/store. Record chain and offset operands.
- if (VectorInfo[i] & PVF_FIRST) {
- assert(StoreOperands.empty() && "Orphaned operand list.");
- StoreOperands.push_back(Chain);
- StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
- }
-
- SDValue RetVal = OutVals[i];
- if (ExtendIntegerRetVal) {
- RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND,
- dl, MVT::i32, RetVal);
- } else if (RetVal.getValueSizeInBits() < 16) {
- // Use 16-bit registers for small load-stores as it's the
- // smallest general purpose register size supported by NVPTX.
- RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
- }
-
- // Record the value to return.
- StoreOperands.push_back(RetVal);
-
- // That's the last element of this store op.
- if (VectorInfo[i] & PVF_LAST) {
- NVPTXISD::NodeType Op;
- unsigned NumElts = StoreOperands.size() - 2;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreRetval;
- break;
- case 2:
- Op = NVPTXISD::StoreRetvalV2;
- break;
- case 4:
- Op = NVPTXISD::StoreRetvalV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
-
- // Adjust type of load/store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
- Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
- StoreOperands, TheStoreType,
- MachinePointerInfo(), /* Align */ 1,
- /* Volatile */ false, /* ReadMem */ false,
- /* WriteMem */ true, /* Size */ 0);
- // Cleanup vector state.
- StoreOperands.clear();
- }
- }
-
- return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
-}
-
-void NVPTXTargetLowering::LowerAsmOperandForConstraint(
- SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const {
- if (Constraint.length() > 1)
- return;
- TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
-}
-
-static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
- switch (Intrinsic) {
- default:
- return 0;
-
- case Intrinsic::nvvm_tex_1d_v4f32_s32:
- return NVPTXISD::Tex1DFloatS32;
- case Intrinsic::nvvm_tex_1d_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloat;
- case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloatGrad;
- case Intrinsic::nvvm_tex_1d_v4s32_s32:
- return NVPTXISD::Tex1DS32S32;
- case Intrinsic::nvvm_tex_1d_v4s32_f32:
- return NVPTXISD::Tex1DS32Float;
- case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
- return NVPTXISD::Tex1DS32FloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
- return NVPTXISD::Tex1DS32FloatGrad;
- case Intrinsic::nvvm_tex_1d_v4u32_s32:
- return NVPTXISD::Tex1DU32S32;
- case Intrinsic::nvvm_tex_1d_v4u32_f32:
- return NVPTXISD::Tex1DU32Float;
- case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
- return NVPTXISD::Tex1DU32FloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
- return NVPTXISD::Tex1DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
- return NVPTXISD::Tex1DArrayFloatS32;
- case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloat;
- case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
- return NVPTXISD::Tex1DArrayS32S32;
- case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32Float;
- case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
- return NVPTXISD::Tex1DArrayU32S32;
- case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32Float;
- case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_2d_v4f32_s32:
- return NVPTXISD::Tex2DFloatS32;
- case Intrinsic::nvvm_tex_2d_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloat;
- case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloatGrad;
- case Intrinsic::nvvm_tex_2d_v4s32_s32:
- return NVPTXISD::Tex2DS32S32;
- case Intrinsic::nvvm_tex_2d_v4s32_f32:
- return NVPTXISD::Tex2DS32Float;
- case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
- return NVPTXISD::Tex2DS32FloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
- return NVPTXISD::Tex2DS32FloatGrad;
- case Intrinsic::nvvm_tex_2d_v4u32_s32:
- return NVPTXISD::Tex2DU32S32;
- case Intrinsic::nvvm_tex_2d_v4u32_f32:
- return NVPTXISD::Tex2DU32Float;
- case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
- return NVPTXISD::Tex2DU32FloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
- return NVPTXISD::Tex2DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
- return NVPTXISD::Tex2DArrayFloatS32;
- case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloat;
- case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
- return NVPTXISD::Tex2DArrayS32S32;
- case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32Float;
- case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
- return NVPTXISD::Tex2DArrayU32S32;
- case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32Float;
- case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_3d_v4f32_s32:
- return NVPTXISD::Tex3DFloatS32;
- case Intrinsic::nvvm_tex_3d_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloat;
- case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloatGrad;
- case Intrinsic::nvvm_tex_3d_v4s32_s32:
- return NVPTXISD::Tex3DS32S32;
- case Intrinsic::nvvm_tex_3d_v4s32_f32:
- return NVPTXISD::Tex3DS32Float;
- case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
- return NVPTXISD::Tex3DS32FloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
- return NVPTXISD::Tex3DS32FloatGrad;
- case Intrinsic::nvvm_tex_3d_v4u32_s32:
- return NVPTXISD::Tex3DU32S32;
- case Intrinsic::nvvm_tex_3d_v4u32_f32:
- return NVPTXISD::Tex3DU32Float;
- case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
- return NVPTXISD::Tex3DU32FloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
- return NVPTXISD::Tex3DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_cube_v4f32_f32:
- return NVPTXISD::TexCubeFloatFloat;
- case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
- return NVPTXISD::TexCubeFloatFloatLevel;
- case Intrinsic::nvvm_tex_cube_v4s32_f32:
- return NVPTXISD::TexCubeS32Float;
- case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
- return NVPTXISD::TexCubeS32FloatLevel;
- case Intrinsic::nvvm_tex_cube_v4u32_f32:
- return NVPTXISD::TexCubeU32Float;
- case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
- return NVPTXISD::TexCubeU32FloatLevel;
-
- case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
- return NVPTXISD::TexCubeArrayFloatFloat;
- case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
- return NVPTXISD::TexCubeArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
- return NVPTXISD::TexCubeArrayS32Float;
- case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
- return NVPTXISD::TexCubeArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
- return NVPTXISD::TexCubeArrayU32Float;
- case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
- return NVPTXISD::TexCubeArrayU32FloatLevel;
-
- case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
- return NVPTXISD::Tld4R2DFloatFloat;
- case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
- return NVPTXISD::Tld4G2DFloatFloat;
- case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
- return NVPTXISD::Tld4B2DFloatFloat;
- case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
- return NVPTXISD::Tld4A2DFloatFloat;
- case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
- return NVPTXISD::Tld4R2DS64Float;
- case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
- return NVPTXISD::Tld4G2DS64Float;
- case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
- return NVPTXISD::Tld4B2DS64Float;
- case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
- return NVPTXISD::Tld4A2DS64Float;
- case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
- return NVPTXISD::Tld4R2DU64Float;
- case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
- return NVPTXISD::Tld4G2DU64Float;
- case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
- return NVPTXISD::Tld4B2DU64Float;
- case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
- return NVPTXISD::Tld4A2DU64Float;
-
- case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
- return NVPTXISD::TexUnified1DFloatS32;
- case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloat;
- case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
- return NVPTXISD::TexUnified1DS32S32;
- case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
- return NVPTXISD::TexUnified1DS32Float;
- case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
- return NVPTXISD::TexUnified1DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
- return NVPTXISD::TexUnified1DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
- return NVPTXISD::TexUnified1DU32S32;
- case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
- return NVPTXISD::TexUnified1DU32Float;
- case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
- return NVPTXISD::TexUnified1DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
- return NVPTXISD::TexUnified1DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
- return NVPTXISD::TexUnified1DArrayFloatS32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
- return NVPTXISD::TexUnified1DArrayS32S32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32Float;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
- return NVPTXISD::TexUnified1DArrayU32S32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32Float;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
- return NVPTXISD::TexUnified2DFloatS32;
- case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloat;
- case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
- return NVPTXISD::TexUnified2DS32S32;
- case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
- return NVPTXISD::TexUnified2DS32Float;
- case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
- return NVPTXISD::TexUnified2DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
- return NVPTXISD::TexUnified2DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
- return NVPTXISD::TexUnified2DU32S32;
- case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
- return NVPTXISD::TexUnified2DU32Float;
- case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
- return NVPTXISD::TexUnified2DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
- return NVPTXISD::TexUnified2DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
- return NVPTXISD::TexUnified2DArrayFloatS32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
- return NVPTXISD::TexUnified2DArrayS32S32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32Float;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
- return NVPTXISD::TexUnified2DArrayU32S32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32Float;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
- return NVPTXISD::TexUnified3DFloatS32;
- case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloat;
- case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
- return NVPTXISD::TexUnified3DS32S32;
- case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
- return NVPTXISD::TexUnified3DS32Float;
- case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
- return NVPTXISD::TexUnified3DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
- return NVPTXISD::TexUnified3DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
- return NVPTXISD::TexUnified3DU32S32;
- case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
- return NVPTXISD::TexUnified3DU32Float;
- case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
- return NVPTXISD::TexUnified3DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
- return NVPTXISD::TexUnified3DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeFloatFloat;
- case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeS32Float;
- case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeU32Float;
- case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeU32FloatLevel;
-
- case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeArrayS32Float;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeArrayU32Float;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
-
- case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedR2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedG2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedB2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedA2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedR2DS64Float;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedG2DS64Float;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedB2DS64Float;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedA2DS64Float;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedR2DU64Float;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedG2DU64Float;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedB2DU64Float;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedA2DU64Float;
- }
-}
-
-static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
- switch (Intrinsic) {
- default:
- return 0;
- case Intrinsic::nvvm_suld_1d_i8_clamp:
- return NVPTXISD::Suld1DI8Clamp;
- case Intrinsic::nvvm_suld_1d_i16_clamp:
- return NVPTXISD::Suld1DI16Clamp;
- case Intrinsic::nvvm_suld_1d_i32_clamp:
- return NVPTXISD::Suld1DI32Clamp;
- case Intrinsic::nvvm_suld_1d_i64_clamp:
- return NVPTXISD::Suld1DI64Clamp;
- case Intrinsic::nvvm_suld_1d_v2i8_clamp:
- return NVPTXISD::Suld1DV2I8Clamp;
- case Intrinsic::nvvm_suld_1d_v2i16_clamp:
- return NVPTXISD::Suld1DV2I16Clamp;
- case Intrinsic::nvvm_suld_1d_v2i32_clamp:
- return NVPTXISD::Suld1DV2I32Clamp;
- case Intrinsic::nvvm_suld_1d_v2i64_clamp:
- return NVPTXISD::Suld1DV2I64Clamp;
- case Intrinsic::nvvm_suld_1d_v4i8_clamp:
- return NVPTXISD::Suld1DV4I8Clamp;
- case Intrinsic::nvvm_suld_1d_v4i16_clamp:
- return NVPTXISD::Suld1DV4I16Clamp;
- case Intrinsic::nvvm_suld_1d_v4i32_clamp:
- return NVPTXISD::Suld1DV4I32Clamp;
- case Intrinsic::nvvm_suld_1d_array_i8_clamp:
- return NVPTXISD::Suld1DArrayI8Clamp;
- case Intrinsic::nvvm_suld_1d_array_i16_clamp:
- return NVPTXISD::Suld1DArrayI16Clamp;
- case Intrinsic::nvvm_suld_1d_array_i32_clamp:
- return NVPTXISD::Suld1DArrayI32Clamp;
- case Intrinsic::nvvm_suld_1d_array_i64_clamp:
- return NVPTXISD::Suld1DArrayI64Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
- return NVPTXISD::Suld1DArrayV2I8Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
- return NVPTXISD::Suld1DArrayV2I16Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
- return NVPTXISD::Suld1DArrayV2I32Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
- return NVPTXISD::Suld1DArrayV2I64Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
- return NVPTXISD::Suld1DArrayV4I8Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
- return NVPTXISD::Suld1DArrayV4I16Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
- return NVPTXISD::Suld1DArrayV4I32Clamp;
- case Intrinsic::nvvm_suld_2d_i8_clamp:
- return NVPTXISD::Suld2DI8Clamp;
- case Intrinsic::nvvm_suld_2d_i16_clamp:
- return NVPTXISD::Suld2DI16Clamp;
- case Intrinsic::nvvm_suld_2d_i32_clamp:
- return NVPTXISD::Suld2DI32Clamp;
- case Intrinsic::nvvm_suld_2d_i64_clamp:
- return NVPTXISD::Suld2DI64Clamp;
- case Intrinsic::nvvm_suld_2d_v2i8_clamp:
- return NVPTXISD::Suld2DV2I8Clamp;
- case Intrinsic::nvvm_suld_2d_v2i16_clamp:
- return NVPTXISD::Suld2DV2I16Clamp;
- case Intrinsic::nvvm_suld_2d_v2i32_clamp:
- return NVPTXISD::Suld2DV2I32Clamp;
- case Intrinsic::nvvm_suld_2d_v2i64_clamp:
- return NVPTXISD::Suld2DV2I64Clamp;
- case Intrinsic::nvvm_suld_2d_v4i8_clamp:
- return NVPTXISD::Suld2DV4I8Clamp;
- case Intrinsic::nvvm_suld_2d_v4i16_clamp:
- return NVPTXISD::Suld2DV4I16Clamp;
- case Intrinsic::nvvm_suld_2d_v4i32_clamp:
- return NVPTXISD::Suld2DV4I32Clamp;
- case Intrinsic::nvvm_suld_2d_array_i8_clamp:
- return NVPTXISD::Suld2DArrayI8Clamp;
- case Intrinsic::nvvm_suld_2d_array_i16_clamp:
- return NVPTXISD::Suld2DArrayI16Clamp;
- case Intrinsic::nvvm_suld_2d_array_i32_clamp:
- return NVPTXISD::Suld2DArrayI32Clamp;
- case Intrinsic::nvvm_suld_2d_array_i64_clamp:
- return NVPTXISD::Suld2DArrayI64Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
- return NVPTXISD::Suld2DArrayV2I8Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
- return NVPTXISD::Suld2DArrayV2I16Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
- return NVPTXISD::Suld2DArrayV2I32Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
- return NVPTXISD::Suld2DArrayV2I64Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
- return NVPTXISD::Suld2DArrayV4I8Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
- return NVPTXISD::Suld2DArrayV4I16Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
- return NVPTXISD::Suld2DArrayV4I32Clamp;
- case Intrinsic::nvvm_suld_3d_i8_clamp:
- return NVPTXISD::Suld3DI8Clamp;
- case Intrinsic::nvvm_suld_3d_i16_clamp:
- return NVPTXISD::Suld3DI16Clamp;
- case Intrinsic::nvvm_suld_3d_i32_clamp:
- return NVPTXISD::Suld3DI32Clamp;
- case Intrinsic::nvvm_suld_3d_i64_clamp:
- return NVPTXISD::Suld3DI64Clamp;
- case Intrinsic::nvvm_suld_3d_v2i8_clamp:
- return NVPTXISD::Suld3DV2I8Clamp;
- case Intrinsic::nvvm_suld_3d_v2i16_clamp:
- return NVPTXISD::Suld3DV2I16Clamp;
- case Intrinsic::nvvm_suld_3d_v2i32_clamp:
- return NVPTXISD::Suld3DV2I32Clamp;
- case Intrinsic::nvvm_suld_3d_v2i64_clamp:
- return NVPTXISD::Suld3DV2I64Clamp;
- case Intrinsic::nvvm_suld_3d_v4i8_clamp:
- return NVPTXISD::Suld3DV4I8Clamp;
- case Intrinsic::nvvm_suld_3d_v4i16_clamp:
- return NVPTXISD::Suld3DV4I16Clamp;
- case Intrinsic::nvvm_suld_3d_v4i32_clamp:
- return NVPTXISD::Suld3DV4I32Clamp;
- case Intrinsic::nvvm_suld_1d_i8_trap:
- return NVPTXISD::Suld1DI8Trap;
- case Intrinsic::nvvm_suld_1d_i16_trap:
- return NVPTXISD::Suld1DI16Trap;
- case Intrinsic::nvvm_suld_1d_i32_trap:
- return NVPTXISD::Suld1DI32Trap;
- case Intrinsic::nvvm_suld_1d_i64_trap:
- return NVPTXISD::Suld1DI64Trap;
- case Intrinsic::nvvm_suld_1d_v2i8_trap:
- return NVPTXISD::Suld1DV2I8Trap;
- case Intrinsic::nvvm_suld_1d_v2i16_trap:
- return NVPTXISD::Suld1DV2I16Trap;
- case Intrinsic::nvvm_suld_1d_v2i32_trap:
- return NVPTXISD::Suld1DV2I32Trap;
- case Intrinsic::nvvm_suld_1d_v2i64_trap:
- return NVPTXISD::Suld1DV2I64Trap;
- case Intrinsic::nvvm_suld_1d_v4i8_trap:
- return NVPTXISD::Suld1DV4I8Trap;
- case Intrinsic::nvvm_suld_1d_v4i16_trap:
- return NVPTXISD::Suld1DV4I16Trap;
- case Intrinsic::nvvm_suld_1d_v4i32_trap:
- return NVPTXISD::Suld1DV4I32Trap;
- case Intrinsic::nvvm_suld_1d_array_i8_trap:
- return NVPTXISD::Suld1DArrayI8Trap;
- case Intrinsic::nvvm_suld_1d_array_i16_trap:
- return NVPTXISD::Suld1DArrayI16Trap;
- case Intrinsic::nvvm_suld_1d_array_i32_trap:
- return NVPTXISD::Suld1DArrayI32Trap;
- case Intrinsic::nvvm_suld_1d_array_i64_trap:
- return NVPTXISD::Suld1DArrayI64Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
- return NVPTXISD::Suld1DArrayV2I8Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
- return NVPTXISD::Suld1DArrayV2I16Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
- return NVPTXISD::Suld1DArrayV2I32Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
- return NVPTXISD::Suld1DArrayV2I64Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
- return NVPTXISD::Suld1DArrayV4I8Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
- return NVPTXISD::Suld1DArrayV4I16Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
- return NVPTXISD::Suld1DArrayV4I32Trap;
- case Intrinsic::nvvm_suld_2d_i8_trap:
- return NVPTXISD::Suld2DI8Trap;
- case Intrinsic::nvvm_suld_2d_i16_trap:
- return NVPTXISD::Suld2DI16Trap;
- case Intrinsic::nvvm_suld_2d_i32_trap:
- return NVPTXISD::Suld2DI32Trap;
- case Intrinsic::nvvm_suld_2d_i64_trap:
- return NVPTXISD::Suld2DI64Trap;
- case Intrinsic::nvvm_suld_2d_v2i8_trap:
- return NVPTXISD::Suld2DV2I8Trap;
- case Intrinsic::nvvm_suld_2d_v2i16_trap:
- return NVPTXISD::Suld2DV2I16Trap;
- case Intrinsic::nvvm_suld_2d_v2i32_trap:
- return NVPTXISD::Suld2DV2I32Trap;
- case Intrinsic::nvvm_suld_2d_v2i64_trap:
- return NVPTXISD::Suld2DV2I64Trap;
- case Intrinsic::nvvm_suld_2d_v4i8_trap:
- return NVPTXISD::Suld2DV4I8Trap;
- case Intrinsic::nvvm_suld_2d_v4i16_trap:
- return NVPTXISD::Suld2DV4I16Trap;
- case Intrinsic::nvvm_suld_2d_v4i32_trap:
- return NVPTXISD::Suld2DV4I32Trap;
- case Intrinsic::nvvm_suld_2d_array_i8_trap:
- return NVPTXISD::Suld2DArrayI8Trap;
- case Intrinsic::nvvm_suld_2d_array_i16_trap:
- return NVPTXISD::Suld2DArrayI16Trap;
- case Intrinsic::nvvm_suld_2d_array_i32_trap:
- return NVPTXISD::Suld2DArrayI32Trap;
- case Intrinsic::nvvm_suld_2d_array_i64_trap:
- return NVPTXISD::Suld2DArrayI64Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
- return NVPTXISD::Suld2DArrayV2I8Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
- return NVPTXISD::Suld2DArrayV2I16Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
- return NVPTXISD::Suld2DArrayV2I32Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
- return NVPTXISD::Suld2DArrayV2I64Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
- return NVPTXISD::Suld2DArrayV4I8Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
- return NVPTXISD::Suld2DArrayV4I16Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
- return NVPTXISD::Suld2DArrayV4I32Trap;
- case Intrinsic::nvvm_suld_3d_i8_trap:
- return NVPTXISD::Suld3DI8Trap;
- case Intrinsic::nvvm_suld_3d_i16_trap:
- return NVPTXISD::Suld3DI16Trap;
- case Intrinsic::nvvm_suld_3d_i32_trap:
- return NVPTXISD::Suld3DI32Trap;
- case Intrinsic::nvvm_suld_3d_i64_trap:
- return NVPTXISD::Suld3DI64Trap;
- case Intrinsic::nvvm_suld_3d_v2i8_trap:
- return NVPTXISD::Suld3DV2I8Trap;
- case Intrinsic::nvvm_suld_3d_v2i16_trap:
- return NVPTXISD::Suld3DV2I16Trap;
- case Intrinsic::nvvm_suld_3d_v2i32_trap:
- return NVPTXISD::Suld3DV2I32Trap;
- case Intrinsic::nvvm_suld_3d_v2i64_trap:
- return NVPTXISD::Suld3DV2I64Trap;
- case Intrinsic::nvvm_suld_3d_v4i8_trap:
- return NVPTXISD::Suld3DV4I8Trap;
- case Intrinsic::nvvm_suld_3d_v4i16_trap:
- return NVPTXISD::Suld3DV4I16Trap;
- case Intrinsic::nvvm_suld_3d_v4i32_trap:
- return NVPTXISD::Suld3DV4I32Trap;
- case Intrinsic::nvvm_suld_1d_i8_zero:
- return NVPTXISD::Suld1DI8Zero;
- case Intrinsic::nvvm_suld_1d_i16_zero:
- return NVPTXISD::Suld1DI16Zero;
- case Intrinsic::nvvm_suld_1d_i32_zero:
- return NVPTXISD::Suld1DI32Zero;
- case Intrinsic::nvvm_suld_1d_i64_zero:
- return NVPTXISD::Suld1DI64Zero;
- case Intrinsic::nvvm_suld_1d_v2i8_zero:
- return NVPTXISD::Suld1DV2I8Zero;
- case Intrinsic::nvvm_suld_1d_v2i16_zero:
- return NVPTXISD::Suld1DV2I16Zero;
- case Intrinsic::nvvm_suld_1d_v2i32_zero:
- return NVPTXISD::Suld1DV2I32Zero;
- case Intrinsic::nvvm_suld_1d_v2i64_zero:
- return NVPTXISD::Suld1DV2I64Zero;
- case Intrinsic::nvvm_suld_1d_v4i8_zero:
- return NVPTXISD::Suld1DV4I8Zero;
- case Intrinsic::nvvm_suld_1d_v4i16_zero:
- return NVPTXISD::Suld1DV4I16Zero;
- case Intrinsic::nvvm_suld_1d_v4i32_zero:
- return NVPTXISD::Suld1DV4I32Zero;
- case Intrinsic::nvvm_suld_1d_array_i8_zero:
- return NVPTXISD::Suld1DArrayI8Zero;
- case Intrinsic::nvvm_suld_1d_array_i16_zero:
- return NVPTXISD::Suld1DArrayI16Zero;
- case Intrinsic::nvvm_suld_1d_array_i32_zero:
- return NVPTXISD::Suld1DArrayI32Zero;
- case Intrinsic::nvvm_suld_1d_array_i64_zero:
- return NVPTXISD::Suld1DArrayI64Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
- return NVPTXISD::Suld1DArrayV2I8Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
- return NVPTXISD::Suld1DArrayV2I16Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
- return NVPTXISD::Suld1DArrayV2I32Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
- return NVPTXISD::Suld1DArrayV2I64Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
- return NVPTXISD::Suld1DArrayV4I8Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
- return NVPTXISD::Suld1DArrayV4I16Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
- return NVPTXISD::Suld1DArrayV4I32Zero;
- case Intrinsic::nvvm_suld_2d_i8_zero:
- return NVPTXISD::Suld2DI8Zero;
- case Intrinsic::nvvm_suld_2d_i16_zero:
- return NVPTXISD::Suld2DI16Zero;
- case Intrinsic::nvvm_suld_2d_i32_zero:
- return NVPTXISD::Suld2DI32Zero;
- case Intrinsic::nvvm_suld_2d_i64_zero:
- return NVPTXISD::Suld2DI64Zero;
- case Intrinsic::nvvm_suld_2d_v2i8_zero:
- return NVPTXISD::Suld2DV2I8Zero;
- case Intrinsic::nvvm_suld_2d_v2i16_zero:
- return NVPTXISD::Suld2DV2I16Zero;
- case Intrinsic::nvvm_suld_2d_v2i32_zero:
- return NVPTXISD::Suld2DV2I32Zero;
- case Intrinsic::nvvm_suld_2d_v2i64_zero:
- return NVPTXISD::Suld2DV2I64Zero;
- case Intrinsic::nvvm_suld_2d_v4i8_zero:
- return NVPTXISD::Suld2DV4I8Zero;
- case Intrinsic::nvvm_suld_2d_v4i16_zero:
- return NVPTXISD::Suld2DV4I16Zero;
- case Intrinsic::nvvm_suld_2d_v4i32_zero:
- return NVPTXISD::Suld2DV4I32Zero;
- case Intrinsic::nvvm_suld_2d_array_i8_zero:
- return NVPTXISD::Suld2DArrayI8Zero;
- case Intrinsic::nvvm_suld_2d_array_i16_zero:
- return NVPTXISD::Suld2DArrayI16Zero;
- case Intrinsic::nvvm_suld_2d_array_i32_zero:
- return NVPTXISD::Suld2DArrayI32Zero;
- case Intrinsic::nvvm_suld_2d_array_i64_zero:
- return NVPTXISD::Suld2DArrayI64Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
- return NVPTXISD::Suld2DArrayV2I8Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
- return NVPTXISD::Suld2DArrayV2I16Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
- return NVPTXISD::Suld2DArrayV2I32Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
- return NVPTXISD::Suld2DArrayV2I64Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
- return NVPTXISD::Suld2DArrayV4I8Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
- return NVPTXISD::Suld2DArrayV4I16Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
- return NVPTXISD::Suld2DArrayV4I32Zero;
- case Intrinsic::nvvm_suld_3d_i8_zero:
- return NVPTXISD::Suld3DI8Zero;
- case Intrinsic::nvvm_suld_3d_i16_zero:
- return NVPTXISD::Suld3DI16Zero;
- case Intrinsic::nvvm_suld_3d_i32_zero:
- return NVPTXISD::Suld3DI32Zero;
- case Intrinsic::nvvm_suld_3d_i64_zero:
- return NVPTXISD::Suld3DI64Zero;
- case Intrinsic::nvvm_suld_3d_v2i8_zero:
- return NVPTXISD::Suld3DV2I8Zero;
- case Intrinsic::nvvm_suld_3d_v2i16_zero:
- return NVPTXISD::Suld3DV2I16Zero;
- case Intrinsic::nvvm_suld_3d_v2i32_zero:
- return NVPTXISD::Suld3DV2I32Zero;
- case Intrinsic::nvvm_suld_3d_v2i64_zero:
- return NVPTXISD::Suld3DV2I64Zero;
- case Intrinsic::nvvm_suld_3d_v4i8_zero:
- return NVPTXISD::Suld3DV4I8Zero;
- case Intrinsic::nvvm_suld_3d_v4i16_zero:
- return NVPTXISD::Suld3DV4I16Zero;
- case Intrinsic::nvvm_suld_3d_v4i32_zero:
- return NVPTXISD::Suld3DV4I32Zero;
- }
-}
-
-// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
-// TgtMemIntrinsic because we need information that is only available in the
-// "Value" type of the destination pointer. In particular, the address space
-// information.
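-//
-// A hedged sketch of what this looks like at the IR level (the exact
-// overload mangling here is an assumption, not taken from this patch):
-//   %v = call float @llvm.nvvm.ldu.global.f.f32.p1f32(
-//            float addrspace(1)* %p, i32 4)
-// The addrspace(1) fact lives only on the pointer Value, and the alignment
-// only in the last operand, so both must be captured below.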
-bool NVPTXTargetLowering::getTgtMemIntrinsic(
- IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
- switch (Intrinsic) {
- default:
- return false;
-
- case Intrinsic::nvvm_atomic_load_add_f32:
- case Intrinsic::nvvm_atomic_load_inc_32:
- case Intrinsic::nvvm_atomic_load_dec_32:
-
- case Intrinsic::nvvm_atomic_add_gen_f_cta:
- case Intrinsic::nvvm_atomic_add_gen_f_sys:
- case Intrinsic::nvvm_atomic_add_gen_i_cta:
- case Intrinsic::nvvm_atomic_add_gen_i_sys:
- case Intrinsic::nvvm_atomic_and_gen_i_cta:
- case Intrinsic::nvvm_atomic_and_gen_i_sys:
- case Intrinsic::nvvm_atomic_cas_gen_i_cta:
- case Intrinsic::nvvm_atomic_cas_gen_i_sys:
- case Intrinsic::nvvm_atomic_dec_gen_i_cta:
- case Intrinsic::nvvm_atomic_dec_gen_i_sys:
- case Intrinsic::nvvm_atomic_inc_gen_i_cta:
- case Intrinsic::nvvm_atomic_inc_gen_i_sys:
- case Intrinsic::nvvm_atomic_max_gen_i_cta:
- case Intrinsic::nvvm_atomic_max_gen_i_sys:
- case Intrinsic::nvvm_atomic_min_gen_i_cta:
- case Intrinsic::nvvm_atomic_min_gen_i_sys:
- case Intrinsic::nvvm_atomic_or_gen_i_cta:
- case Intrinsic::nvvm_atomic_or_gen_i_sys:
- case Intrinsic::nvvm_atomic_exch_gen_i_cta:
- case Intrinsic::nvvm_atomic_exch_gen_i_sys:
- case Intrinsic::nvvm_atomic_xor_gen_i_cta:
- case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = true;
- Info.align = 0;
- return true;
- }
-
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
-
- return true;
- }
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p: {
- auto &DL = I.getModule()->getDataLayout();
-
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
-
- return true;
- }
-
- case Intrinsic::nvvm_tex_1d_v4f32_s32:
- case Intrinsic::nvvm_tex_1d_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_v4f32_s32:
- case Intrinsic::nvvm_tex_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_v4f32_s32:
- case Intrinsic::nvvm_tex_3d_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
- Info.opc = getOpcForTextureInstr(Intrinsic);
- Info.memVT = MVT::v4f32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_tex_1d_v4s32_s32:
- case Intrinsic::nvvm_tex_1d_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_v4s32_s32:
- case Intrinsic::nvvm_tex_2d_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_v4s32_s32:
- case Intrinsic::nvvm_tex_3d_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_v4u32_s32:
- case Intrinsic::nvvm_tex_1d_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_v4u32_s32:
- case Intrinsic::nvvm_tex_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_v4u32_s32:
- case Intrinsic::nvvm_tex_3d_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
- Info.opc = getOpcForTextureInstr(Intrinsic);
- Info.memVT = MVT::v4i32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i8_clamp:
- case Intrinsic::nvvm_suld_1d_v2i8_clamp:
- case Intrinsic::nvvm_suld_1d_v4i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
- case Intrinsic::nvvm_suld_2d_i8_clamp:
- case Intrinsic::nvvm_suld_2d_v2i8_clamp:
- case Intrinsic::nvvm_suld_2d_v4i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
- case Intrinsic::nvvm_suld_3d_i8_clamp:
- case Intrinsic::nvvm_suld_3d_v2i8_clamp:
- case Intrinsic::nvvm_suld_3d_v4i8_clamp:
- case Intrinsic::nvvm_suld_1d_i8_trap:
- case Intrinsic::nvvm_suld_1d_v2i8_trap:
- case Intrinsic::nvvm_suld_1d_v4i8_trap:
- case Intrinsic::nvvm_suld_1d_array_i8_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
- case Intrinsic::nvvm_suld_2d_i8_trap:
- case Intrinsic::nvvm_suld_2d_v2i8_trap:
- case Intrinsic::nvvm_suld_2d_v4i8_trap:
- case Intrinsic::nvvm_suld_2d_array_i8_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
- case Intrinsic::nvvm_suld_3d_i8_trap:
- case Intrinsic::nvvm_suld_3d_v2i8_trap:
- case Intrinsic::nvvm_suld_3d_v4i8_trap:
- case Intrinsic::nvvm_suld_1d_i8_zero:
- case Intrinsic::nvvm_suld_1d_v2i8_zero:
- case Intrinsic::nvvm_suld_1d_v4i8_zero:
- case Intrinsic::nvvm_suld_1d_array_i8_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
- case Intrinsic::nvvm_suld_2d_i8_zero:
- case Intrinsic::nvvm_suld_2d_v2i8_zero:
- case Intrinsic::nvvm_suld_2d_v4i8_zero:
- case Intrinsic::nvvm_suld_2d_array_i8_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
- case Intrinsic::nvvm_suld_3d_i8_zero:
- case Intrinsic::nvvm_suld_3d_v2i8_zero:
- case Intrinsic::nvvm_suld_3d_v4i8_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i8;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i16_clamp:
- case Intrinsic::nvvm_suld_1d_v2i16_clamp:
- case Intrinsic::nvvm_suld_1d_v4i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
- case Intrinsic::nvvm_suld_2d_i16_clamp:
- case Intrinsic::nvvm_suld_2d_v2i16_clamp:
- case Intrinsic::nvvm_suld_2d_v4i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
- case Intrinsic::nvvm_suld_3d_i16_clamp:
- case Intrinsic::nvvm_suld_3d_v2i16_clamp:
- case Intrinsic::nvvm_suld_3d_v4i16_clamp:
- case Intrinsic::nvvm_suld_1d_i16_trap:
- case Intrinsic::nvvm_suld_1d_v2i16_trap:
- case Intrinsic::nvvm_suld_1d_v4i16_trap:
- case Intrinsic::nvvm_suld_1d_array_i16_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
- case Intrinsic::nvvm_suld_2d_i16_trap:
- case Intrinsic::nvvm_suld_2d_v2i16_trap:
- case Intrinsic::nvvm_suld_2d_v4i16_trap:
- case Intrinsic::nvvm_suld_2d_array_i16_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
- case Intrinsic::nvvm_suld_3d_i16_trap:
- case Intrinsic::nvvm_suld_3d_v2i16_trap:
- case Intrinsic::nvvm_suld_3d_v4i16_trap:
- case Intrinsic::nvvm_suld_1d_i16_zero:
- case Intrinsic::nvvm_suld_1d_v2i16_zero:
- case Intrinsic::nvvm_suld_1d_v4i16_zero:
- case Intrinsic::nvvm_suld_1d_array_i16_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
- case Intrinsic::nvvm_suld_2d_i16_zero:
- case Intrinsic::nvvm_suld_2d_v2i16_zero:
- case Intrinsic::nvvm_suld_2d_v4i16_zero:
- case Intrinsic::nvvm_suld_2d_array_i16_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
- case Intrinsic::nvvm_suld_3d_i16_zero:
- case Intrinsic::nvvm_suld_3d_v2i16_zero:
- case Intrinsic::nvvm_suld_3d_v4i16_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i16;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i32_clamp:
- case Intrinsic::nvvm_suld_1d_v2i32_clamp:
- case Intrinsic::nvvm_suld_1d_v4i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
- case Intrinsic::nvvm_suld_2d_i32_clamp:
- case Intrinsic::nvvm_suld_2d_v2i32_clamp:
- case Intrinsic::nvvm_suld_2d_v4i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
- case Intrinsic::nvvm_suld_3d_i32_clamp:
- case Intrinsic::nvvm_suld_3d_v2i32_clamp:
- case Intrinsic::nvvm_suld_3d_v4i32_clamp:
- case Intrinsic::nvvm_suld_1d_i32_trap:
- case Intrinsic::nvvm_suld_1d_v2i32_trap:
- case Intrinsic::nvvm_suld_1d_v4i32_trap:
- case Intrinsic::nvvm_suld_1d_array_i32_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
- case Intrinsic::nvvm_suld_2d_i32_trap:
- case Intrinsic::nvvm_suld_2d_v2i32_trap:
- case Intrinsic::nvvm_suld_2d_v4i32_trap:
- case Intrinsic::nvvm_suld_2d_array_i32_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
- case Intrinsic::nvvm_suld_3d_i32_trap:
- case Intrinsic::nvvm_suld_3d_v2i32_trap:
- case Intrinsic::nvvm_suld_3d_v4i32_trap:
- case Intrinsic::nvvm_suld_1d_i32_zero:
- case Intrinsic::nvvm_suld_1d_v2i32_zero:
- case Intrinsic::nvvm_suld_1d_v4i32_zero:
- case Intrinsic::nvvm_suld_1d_array_i32_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
- case Intrinsic::nvvm_suld_2d_i32_zero:
- case Intrinsic::nvvm_suld_2d_v2i32_zero:
- case Intrinsic::nvvm_suld_2d_v4i32_zero:
- case Intrinsic::nvvm_suld_2d_array_i32_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
- case Intrinsic::nvvm_suld_3d_i32_zero:
- case Intrinsic::nvvm_suld_3d_v2i32_zero:
- case Intrinsic::nvvm_suld_3d_v4i32_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i64_clamp:
- case Intrinsic::nvvm_suld_1d_v2i64_clamp:
- case Intrinsic::nvvm_suld_1d_array_i64_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
- case Intrinsic::nvvm_suld_2d_i64_clamp:
- case Intrinsic::nvvm_suld_2d_v2i64_clamp:
- case Intrinsic::nvvm_suld_2d_array_i64_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
- case Intrinsic::nvvm_suld_3d_i64_clamp:
- case Intrinsic::nvvm_suld_3d_v2i64_clamp:
- case Intrinsic::nvvm_suld_1d_i64_trap:
- case Intrinsic::nvvm_suld_1d_v2i64_trap:
- case Intrinsic::nvvm_suld_1d_array_i64_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
- case Intrinsic::nvvm_suld_2d_i64_trap:
- case Intrinsic::nvvm_suld_2d_v2i64_trap:
- case Intrinsic::nvvm_suld_2d_array_i64_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
- case Intrinsic::nvvm_suld_3d_i64_trap:
- case Intrinsic::nvvm_suld_3d_v2i64_trap:
- case Intrinsic::nvvm_suld_1d_i64_zero:
- case Intrinsic::nvvm_suld_1d_v2i64_zero:
- case Intrinsic::nvvm_suld_1d_array_i64_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
- case Intrinsic::nvvm_suld_2d_i64_zero:
- case Intrinsic::nvvm_suld_2d_v2i64_zero:
- case Intrinsic::nvvm_suld_2d_array_i64_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
- case Intrinsic::nvvm_suld_3d_i64_zero:
- case Intrinsic::nvvm_suld_3d_v2i64_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i64;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
- }
- return false;
-}
-
-/// isLegalAddressingMode - Return true if the addressing mode represented
-/// by AM is legal for this target, for a load/store of the specified type.
-/// Used to guide target specific optimizations, like loop strength reduction
-/// (LoopStrengthReduce.cpp) and memory optimization for address mode
-/// (CodeGenPrepare.cpp)
-bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
- const AddrMode &AM, Type *Ty,
- unsigned AS) const {
- // AddrMode - This represents an addressing mode of:
- // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
- //
- // The legal address modes are
- // - [avar]
- // - [areg]
- // - [areg+immoff]
- // - [immAddr]
-
- if (AM.BaseGV) {
- return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
- }
-
- switch (AM.Scale) {
- case 0: // "r", "r+i" or "i" is allowed
- break;
- case 1:
- if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
- return false;
- // Otherwise we have r+i.
- break;
- default:
- // No scale > 1 is allowed
- return false;
- }
- return true;
-}
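-
-// Illustrative examples (a sketch; the field names follow
-// TargetLowering::AddrMode). For a PTX access such as
-//   ld.global.f32 %f0, [%rd1+16];
-// the mode {BaseGV: null, BaseOffs: 16, HasBaseReg: true, Scale: 0} is
-// legal, while any mode with Scale > 1, or Scale == 1 plus a base
-// register, is rejected, since PTX has no scaled-index addressing.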
-
-//===----------------------------------------------------------------------===//
-// NVPTX Inline Assembly Support
-//===----------------------------------------------------------------------===//
-
-/// getConstraintType - Given a constraint letter, return the type of
-/// constraint it is for this target.
-NVPTXTargetLowering::ConstraintType
-NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- default:
- break;
- case 'b':
- case 'r':
- case 'h':
- case 'c':
- case 'l':
- case 'f':
- case 'd':
- case '0':
- case 'N':
- return C_RegisterClass;
- }
- }
- return TargetLowering::getConstraintType(Constraint);
-}
-
-std::pair<unsigned, const TargetRegisterClass *>
-NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint,
- MVT VT) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- case 'b':
- return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
- case 'c':
- return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
- case 'h':
- return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
- case 'r':
- return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
- case 'l':
- case 'N':
- return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
- case 'f':
- return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
- case 'd':
- return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
- }
- }
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-}
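-
-// Example of how these constraints surface in CUDA inline asm (a sketch):
-//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
-// Each 'r' selects Int32RegsRegClass above; 'l' would pick the 64-bit
-// integer class, and 'f'/'d' the f32/f64 register classes.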
-
-//===----------------------------------------------------------------------===//
-// NVPTX DAG Combining
-//===----------------------------------------------------------------------===//
-
-bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
- CodeGenOpt::Level OptLevel) const {
- // Always honor command-line argument
- if (FMAContractLevelOpt.getNumOccurrences() > 0)
- return FMAContractLevelOpt > 0;
-
- // Do not contract if we're not optimizing the code.
- if (OptLevel == 0)
- return false;
-
- // Honor TargetOptions flags that explicitly say fusion is okay.
- if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
- return true;
-
- return allowUnsafeFPMath(MF);
-}
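-
-// Precedence sketch (assuming the cl::opt behind FMAContractLevelOpt is
-// -nvptx-fma-level and that TargetOptions is driven by llc's -fp-contract):
-//   llc -nvptx-fma-level=1   -> contraction forced on, regardless of -O
-//   llc -O0                  -> otherwise contraction is off
-//   llc -fp-contract=fast    -> otherwise contraction is on
-// and failing all of those, allowUnsafeFPMath() below decides.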
-
-bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
- // Honor TargetOptions flags that explicitly say unsafe math is okay.
- if (MF.getTarget().Options.UnsafeFPMath)
- return true;
-
- // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
- const Function *F = MF.getFunction();
- if (F->hasFnAttribute("unsafe-fp-math")) {
- Attribute Attr = F->getFnAttribute("unsafe-fp-math");
- StringRef Val = Attr.getValueAsString();
- if (Val == "true")
- return true;
- }
-
- return false;
-}
-
-/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
-/// operands N0 and N1. This is a helper for PerformADDCombine that is
-/// called with the default operands, and if that fails, with commuted
-/// operands.
-static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
- TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
- CodeGenOpt::Level OptLevel) {
- SelectionDAG &DAG = DCI.DAG;
- // Skip vector types; this combine only handles scalar values.
- EVT VT = N0.getValueType();
- if (VT.isVector())
- return SDValue();
-
- // fold (add (mul a, b), c) -> (mad a, b, c)
- if (N0.getOpcode() == ISD::MUL) {
- assert(VT.isInteger());
- // For integer:
- // Since integer multiply-add costs the same as integer multiply
- // but is more costly than integer add, do the fusion only when
- // the mul is only used in the add.
- if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
- !N0.getNode()->hasOneUse())
- return SDValue();
-
- // Do the folding
- return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
- }
- else if (N0.getOpcode() == ISD::FMUL) {
- if (VT == MVT::f32 || VT == MVT::f64) {
- const auto *TLI = static_cast<const NVPTXTargetLowering *>(
- &DAG.getTargetLoweringInfo());
- if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
- return SDValue();
-
- // For floating point:
- // Do the fusion only when the mul has fewer than 5 uses and all
- // of them are adds.
- // The heuristic is that if a use is not an add, that use cannot
- // be fused into an fma, so the mul is still needed anyway.
- // If there are more than 4 uses, even if they are all adds,
- // fusing them will increase register pressure.
- //
- int numUses = 0;
- int nonAddCount = 0;
- for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
- UE = N0.getNode()->use_end();
- UI != UE; ++UI) {
- numUses++;
- SDNode *User = *UI;
- if (User->getOpcode() != ISD::FADD)
- ++nonAddCount;
- }
- if (numUses >= 5)
- return SDValue();
- if (nonAddCount) {
- int orderNo = N->getIROrder();
- int orderNo2 = N0.getNode()->getIROrder();
- // A simple heuristic for potential register pressure: the
- // IR-order difference approximates the distance between def and
- // use, and a longer distance is more likely to cause register
- // pressure.
- if (orderNo - orderNo2 < 500)
- return SDValue();
-
- // Now, check if at least one of the FMUL's operands is live
- // beyond node N, which guarantees that the FMA will not increase
- // register pressure at node N.
- bool opIsLive = false;
- const SDNode *left = N0.getOperand(0).getNode();
- const SDNode *right = N0.getOperand(1).getNode();
-
- if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
- opIsLive = true;
-
- if (!opIsLive)
- for (const SDNode *User : left->uses()) {
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
-
- if (!opIsLive)
- for (const SDNode *User : right->uses()) {
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
-
- if (!opIsLive)
- return SDValue();
- }
-
- return DAG.getNode(ISD::FMA, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
- }
- }
-
- return SDValue();
-}
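-
-// Worked examples of the folds above (a sketch):
-//   i32:  (add (mul %a, %b), %c)   -> (NVPTXISD::IMAD %a, %b, %c),
-//         but only when this add is the mul's sole use;
-//   f32:  (fadd (fmul %a, %b), %c) -> (ISD::FMA %a, %b, %c),
-//         but only when contraction is allowed and the use-count and
-//         IR-order heuristics above suggest no register-pressure loss.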
-
-/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
-///
-static SDValue PerformADDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
- CodeGenOpt::Level OptLevel) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- // First try with the default operand order.
- if (SDValue Result =
- PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
- return Result;
-
- // If that didn't work, try again with the operands commuted.
- return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
-}
-
-static SDValue PerformANDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // The type legalizer turns a vector load of i8 values into a zextload to i16
- // registers, optionally ANY_EXTENDs it (if target type is integer),
- // and ANDs off the high 8 bits. Since we turn this load into a
- // target-specific DAG node, the DAG combiner fails to eliminate these AND
- // nodes. Do that here.
- SDValue Val = N->getOperand(0);
- SDValue Mask = N->getOperand(1);
-
- if (isa<ConstantSDNode>(Val)) {
- std::swap(Val, Mask);
- }
-
- SDValue AExt;
- // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
- if (Val.getOpcode() == ISD::ANY_EXTEND) {
- AExt = Val;
- Val = Val->getOperand(0);
- }
-
- if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
- Val = Val->getOperand(0);
- }
-
- if (Val->getOpcode() == NVPTXISD::LoadV2 ||
- Val->getOpcode() == NVPTXISD::LoadV4) {
- ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
- if (!MaskCnst) {
- // Not an AND with a constant
- return SDValue();
- }
-
- uint64_t MaskVal = MaskCnst->getZExtValue();
- if (MaskVal != 0xff) {
- // Not an AND that chops off top 8 bits
- return SDValue();
- }
-
- MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
- if (!Mem) {
- // Not a MemSDNode?!?
- return SDValue();
- }
-
- EVT MemVT = Mem->getMemoryVT();
- if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
- // We only handle the i8 case
- return SDValue();
- }
-
- unsigned ExtType =
- cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
- getZExtValue();
- if (ExtType == ISD::SEXTLOAD) {
- // If for some reason the load is a sextload, the and is needed to zero
- // out the high 8 bits
- return SDValue();
- }
-
- bool AddTo = false;
- if (AExt.getNode() != nullptr) {
- // Re-insert the ext as a zext.
- Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
- AExt.getValueType(), Val);
- AddTo = true;
- }
-
- // If we get here, the AND is unnecessary. Just replace it with the load
- DCI.CombineTo(N, Val, AddTo);
- }
-
- return SDValue();
-}
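-
-// Example of the chain this combine cleans up (a sketch):
-//   (and (any_extend (LoadV4 ...)), 0xff)   ; memVT v4i8, zextload
-// The zextload already cleared bits 8..15, so the AND is redundant and
-// the (re-zero-extended) load result replaces it directly.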
-
-static SDValue PerformREMCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
-
- // Don't do anything at less than -O2.
- if (OptLevel < CodeGenOpt::Default)
- return SDValue();
-
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
- bool IsSigned = N->getOpcode() == ISD::SREM;
- unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
-
- const SDValue &Num = N->getOperand(0);
- const SDValue &Den = N->getOperand(1);
-
- for (const SDNode *U : Num->uses()) {
- if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
- U->getOperand(1) == Den) {
- // Num % Den -> Num - (Num / Den) * Den
- return DAG.getNode(ISD::SUB, DL, VT, Num,
- DAG.getNode(ISD::MUL, DL, VT,
- DAG.getNode(DivOpc, DL, VT, Num, Den),
- Den));
- }
- }
- return SDValue();
-}
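-
-// Worked example (a sketch): when IR computes both x / y and x % y,
-// the pair lowers to a single division:
-//   q = x / y;
-//   r = x - q * y;  // replaces the srem/urem node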
-
-enum OperandSignedness {
- Signed = 0,
- Unsigned,
- Unknown
-};
-
-/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
-/// that can be demoted to \p OptSize bits without loss of information. The
-/// signedness of the operand, if determinable, is placed in \p S.
-static bool IsMulWideOperandDemotable(SDValue Op,
- unsigned OptSize,
- OperandSignedness &S) {
- S = Unknown;
-
- if (Op.getOpcode() == ISD::SIGN_EXTEND ||
- Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
- EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getSizeInBits() <= OptSize) {
- S = Signed;
- return true;
- }
- } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
- EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getSizeInBits() <= OptSize) {
- S = Unsigned;
- return true;
- }
- }
-
- return false;
-}
-
-/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
-/// be demoted to \p OptSize bits without loss of information. If the operands
-/// contain a constant, it should appear as the RHS operand. The signedness of
-/// the operands is placed in \p IsSigned.
-static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
- unsigned OptSize,
- bool &IsSigned) {
- OperandSignedness LHSSign;
-
- // The LHS operand must be a demotable op
- if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
- return false;
-
- // We should have been able to determine the signedness from the LHS
- if (LHSSign == Unknown)
- return false;
-
- IsSigned = (LHSSign == Signed);
-
- // The RHS can be a demotable op or a constant
- if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
- const APInt &Val = CI->getAPIntValue();
- if (LHSSign == Unsigned) {
- return Val.isIntN(OptSize);
- } else {
- return Val.isSignedIntN(OptSize);
- }
- } else {
- OperandSignedness RHSSign;
- if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
- return false;
-
- return LHSSign == RHSSign;
- }
-}
-
-/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
-/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
-/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
-/// amount.
-static SDValue TryMULWIDECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- EVT MulType = N->getValueType(0);
- if (MulType != MVT::i32 && MulType != MVT::i64) {
- return SDValue();
- }
-
- SDLoc DL(N);
- unsigned OptSize = MulType.getSizeInBits() >> 1;
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- // Canonicalize the multiply so the constant (if any) is on the right
- if (N->getOpcode() == ISD::MUL) {
- if (isa<ConstantSDNode>(LHS)) {
- std::swap(LHS, RHS);
- }
- }
-
- // If we have a SHL, determine the actual multiply amount
- if (N->getOpcode() == ISD::SHL) {
- ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
- if (!ShlRHS) {
- return SDValue();
- }
-
- APInt ShiftAmt = ShlRHS->getAPIntValue();
- unsigned BitWidth = MulType.getSizeInBits();
- if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
- APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
- RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
- } else {
- return SDValue();
- }
- }
-
- bool Signed;
- // Verify that our operands are demotable
- if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
- return SDValue();
- }
-
- EVT DemotedVT;
- if (MulType == MVT::i32) {
- DemotedVT = MVT::i16;
- } else {
- DemotedVT = MVT::i32;
- }
-
- // Truncate the operands to the correct size. Note that these are just for
- // type consistency and will (likely) be eliminated in later phases.
- SDValue TruncLHS =
- DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
- SDValue TruncRHS =
- DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
-
- unsigned Opc;
- if (Signed) {
- Opc = NVPTXISD::MUL_WIDE_SIGNED;
- } else {
- Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
- }
-
- return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
-}
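To make the transform concrete: when both operands of an i32 multiply are extended from 16 bits, the widening 16x16->32 product of the demoted operands equals the original 32-bit product, and a SHL by a constant is first rewritten as a multiply by 1 << c. A small standalone C++ sketch of that arithmetic (mulWideS16 is a hypothetical stand-in for mul.wide.s16 semantics, not part of the patch):

#include <cassert>
#include <cstdint>

// mul.wide.s16-style semantics: a 16x16 -> 32-bit signed multiply.
static int32_t mulWideS16(int16_t A, int16_t B) {
  return int32_t(A) * int32_t(B);
}

int main() {
  int16_t A = 12345, B = -321;
  // No information is lost: the 32-bit result matches the full-width product,
  // which is exactly the equivalence the combine proves before demoting.
  assert(int64_t(mulWideS16(A, B)) == int64_t(A) * int64_t(B));
  // A SHL by a constant is handled by rewriting x << c as x * (1 << c).
  assert((int32_t(A) << 4) == int32_t(A) * (1 << 4));
  return 0;
}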
-
-/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
-static SDValue PerformMULCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- if (OptLevel > 0) {
- // Try mul.wide combining at OptLevel > 0
- if (SDValue Ret = TryMULWIDECombine(N, DCI))
- return Ret;
- }
-
- return SDValue();
-}
-
-/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
-static SDValue PerformSHLCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- if (OptLevel > 0) {
- // Try mul.wide combining at OptLevel > 0
- if (SDValue Ret = TryMULWIDECombine(N, DCI))
- return Ret;
- }
-
- return SDValue();
-}
-
-static SDValue PerformSETCCCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- EVT CCType = N->getValueType(0);
- SDValue A = N->getOperand(0);
- SDValue B = N->getOperand(1);
-
- if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
- return SDValue();
-
- SDLoc DL(N);
- // setp.f16x2 returns two scalar predicates, which we need to
- // convert back to v2i1. The returned result will be scalarized by
- // the legalizer, but the comparison will remain a single vector
- // instruction.
- SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
- DCI.DAG.getVTList(MVT::i1, MVT::i1),
- {A, B, N->getOperand(2)});
- return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
- CCNode.getValue(1));
-}
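A hedged model of the shape of this combine, with f16 approximated by float since standard C++ has no half type: one packed compare yields two scalar predicates, which are then repackaged as the two-element vector the DAG expects (setpLtF16x2 is a hypothetical name):

#include <array>

// One f16x2-style compare producing two scalar predicates, analogous to
// SETP_F16X2 returning two i1 values that BUILD_VECTOR reassembles as v2i1.
static std::array<bool, 2> setpLtF16x2(const float (&A)[2], const float (&B)[2]) {
  return {A[0] < B[0], A[1] < B[1]};
}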
-
-SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
- switch (N->getOpcode()) {
- default: break;
- case ISD::ADD:
- case ISD::FADD:
- return PerformADDCombine(N, DCI, STI, OptLevel);
- case ISD::MUL:
- return PerformMULCombine(N, DCI, OptLevel);
- case ISD::SHL:
- return PerformSHLCombine(N, DCI, OptLevel);
- case ISD::AND:
- return PerformANDCombine(N, DCI);
- case ISD::UREM:
- case ISD::SREM:
- return PerformREMCombine(N, DCI, OptLevel);
- case ISD::SETCC:
- return PerformSETCCCombine(N, DCI);
- }
- return SDValue();
-}
-
-/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
-static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- EVT ResVT = N->getValueType(0);
- SDLoc DL(N);
-
- assert(ResVT.isVector() && "Vector load must have vector type");
-
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 loads of <2 x double> here
- // but I'm leaving that as a TODO for now.
- assert(ResVT.isSimple() && "Can only handle simple types");
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
- return;
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f16:
- case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
- // This is a "native" vector type
- break;
- }
-
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
- unsigned Align = LD->getAlignment();
- auto &TD = DAG.getDataLayout();
- unsigned PrefAlign =
- TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
- if (Align < PrefAlign) {
- // This load is not sufficiently aligned, so bail out and let this vector
- // load be scalarized. Note that we may still be able to emit smaller
- // vector loads. For example, if we are loading a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return;
- }
-
- EVT EltVT = ResVT.getVectorElementType();
- unsigned NumElts = ResVT.getVectorNumElements();
-
- // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
-
- unsigned Opcode = 0;
- SDVTList LdResVTs;
- bool LoadF16x2 = false;
-
- switch (NumElts) {
- default:
- return;
- case 2:
- Opcode = NVPTXISD::LoadV2;
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- Opcode = NVPTXISD::LoadV4;
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- case 8: {
- // v8f16 is a special case. PTX doesn't have an ld.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // load them with ld.v4.b32.
- assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
- LoadF16x2 = true;
- Opcode = NVPTXISD::LoadV4;
- EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
- MVT::Other};
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
-
- // Copy regular operands
- SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
-
- // The select routine does not have access to the LoadSDNode instance, so
- // pass along the extension information
- OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- LD->getMemoryVT(),
- LD->getMemOperand());
-
- SmallVector<SDValue, 8> ScalarRes;
- if (LoadF16x2) {
- // Split v2f16 subvectors back into individual elements.
- NumElts /= 2;
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue SubVector = NewLD.getValue(i);
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(0, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(1, DL));
- ScalarRes.push_back(E0);
- ScalarRes.push_back(E1);
- }
- } else {
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
-
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
-}
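A hedged summary of the opcode selection above, as a standalone sketch (VecLoadKind and classifyVectorLoad are hypothetical names, not part of the patch): two elements map to LoadV2, four to LoadV4, and eight f16 elements to LoadV4 over v2f16 chunks; everything else is left for the legalizer to split or scalarize, as in the <4 x float> align-8 example in the comment above.

enum class VecLoadKind { Scalarize, V2, V4, V4F16x2 };

// Mirrors the NumElts switch in ReplaceLoadVector.
static VecLoadKind classifyVectorLoad(unsigned NumElts, bool EltIsF16) {
  switch (NumElts) {
  case 2: return VecLoadKind::V2;                 // NVPTXISD::LoadV2
  case 4: return VecLoadKind::V4;                 // NVPTXISD::LoadV4
  case 8: return EltIsF16 ? VecLoadKind::V4F16x2  // <8 x f16> as 4 x v2f16
                          : VecLoadKind::Scalarize;
  default: return VecLoadKind::Scalarize;         // let the legalizer split it
  }
}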
-
-static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- SDValue Chain = N->getOperand(0);
- SDValue Intrin = N->getOperand(1);
- SDLoc DL(N);
-
- // Get the intrinsic ID
- unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p: {
- EVT ResVT = N->getValueType(0);
-
- if (ResVT.isVector()) {
- // Vector LDG/LDU
-
- unsigned NumElts = ResVT.getVectorNumElements();
- EVT EltVT = ResVT.getVectorElementType();
-
- // Since LDU/LDG are target nodes, we cannot rely on DAG type
- // legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
-
- unsigned Opcode = 0;
- SDVTList LdResVTs;
-
- switch (NumElts) {
- default:
- return;
- case 2:
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- Opcode = NVPTXISD::LDGV2;
- break;
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
- Opcode = NVPTXISD::LDUV2;
- break;
- }
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- Opcode = NVPTXISD::LDGV4;
- break;
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
- Opcode = NVPTXISD::LDUV4;
- break;
- }
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
-
- SmallVector<SDValue, 8> OtherOps;
-
- // Copy regular operands
-
- OtherOps.push_back(Chain); // Chain
- // Skip operand 1 (intrinsic ID)
- // Others
- OtherOps.append(N->op_begin() + 2, N->op_end());
-
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- MemSD->getMemoryVT(),
- MemSD->getMemOperand());
-
- SmallVector<SDValue, 4> ScalarRes;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res =
- DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- SDValue BuildVec =
- DAG.getBuildVector(ResVT, DL, ScalarRes);
-
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
- } else {
- // i8 LDG/LDU
- assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
- "Custom handling of non-i8 ldu/ldg?");
-
- // Just copy all operands as-is
- SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
-
- // Force output to i16
- SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
-
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
-
- // We make sure the memory type is i8, which will be used during isel
- // to select the proper instruction.
- SDValue NewLD =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
- MVT::i8, MemSD->getMemOperand());
-
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
- NewLD.getValue(0)));
- Results.push_back(NewLD.getValue(1));
- }
- }
- }
-}
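One note on the scalar i8 path above: because i8 is not a legal register type here, the value travels through the DAG as i16 while the memory type stays i8, and a final truncate restores the IR-visible width. A minimal sketch of that contract (ldgByteAsI16 and truncateToI8 are hypothetical names, not part of the patch):

#include <cstdint>

// The re-emitted node produces an i16 even though only one byte was loaded;
// the i8 memory type is what isel later uses to pick the byte-sized load.
static int16_t ldgByteAsI16(const int8_t *P) { return static_cast<int16_t>(*P); }

// The trailing ISD::TRUNCATE restores the i8 width the IR expects.
static int8_t truncateToI8(int16_t Widened) { return static_cast<int8_t>(Widened); }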
-
-void NVPTXTargetLowering::ReplaceNodeResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- switch (N->getOpcode()) {
- default:
- report_fatal_error("Unhandled custom legalization");
- case ISD::LOAD:
- ReplaceLoadVector(N, DAG, Results);
- return;
- case ISD::INTRINSIC_W_CHAIN:
- ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
- return;
- }
-}
-
-// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
-void NVPTXSection::anchor() {}
-
-NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
- delete static_cast<NVPTXSection *>(TextSection);
- delete static_cast<NVPTXSection *>(DataSection);
- delete static_cast<NVPTXSection *>(BSSSection);
- delete static_cast<NVPTXSection *>(ReadOnlySection);
-
- delete static_cast<NVPTXSection *>(StaticCtorSection);
- delete static_cast<NVPTXSection *>(StaticDtorSection);
- delete static_cast<NVPTXSection *>(LSDASection);
- delete static_cast<NVPTXSection *>(EHFrameSection);
- delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
- delete static_cast<NVPTXSection *>(DwarfInfoSection);
- delete static_cast<NVPTXSection *>(DwarfLineSection);
- delete static_cast<NVPTXSection *>(DwarfFrameSection);
- delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
- delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
- delete static_cast<NVPTXSection *>(DwarfStrSection);
- delete static_cast<NVPTXSection *>(DwarfLocSection);
- delete static_cast<NVPTXSection *>(DwarfARangesSection);
- delete static_cast<NVPTXSection *>(DwarfRangesSection);
- delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
-}
-
-MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
- const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- return getDataSection();
-}
+//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSection.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool> sched4reg(
+    "nvptx-sched4reg",
+    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
+
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
+                             " 1: do it, 2: do it aggressively)"),
+                    cl::init(2));
+
+static cl::opt<int> UsePrecDivF32(
+    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
+             " IEEE Compliant F32 div.rnd if available."),
+    cl::init(2));
+
+static cl::opt<bool> UsePrecSqrtF32(
+    "nvptx-prec-sqrtf32", cl::Hidden,
+    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+    cl::init(true));
+
+static cl::opt<bool> FtzEnabled(
+    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+    cl::init(false));
+
+int NVPTXTargetLowering::getDivF32Level() const {
+  if (UsePrecDivF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-divf32=N is used on the command-line, always honor it
+    return UsePrecDivF32;
+  } else {
+    // Otherwise, use div.approx if fast math is enabled
+    if (getTargetMachine().Options.UnsafeFPMath)
+      return 0;
+    else
+      return 2;
+  }
+}
+
+bool NVPTXTargetLowering::usePrecSqrtF32() const {
+  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+    return UsePrecSqrtF32;
+  } else {
+    // Otherwise, use sqrt.approx if fast math is enabled
+    return !getTargetMachine().Options.UnsafeFPMath;
+  }
+}
+
+bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
+  // TODO: Get rid of this flag; there can be only one way to do this.
+  if (FtzEnabled.getNumOccurrences() > 0) {
+    // If nvptx-f32ftz is used on the command-line, always honor it
+    return FtzEnabled;
+  } else {
+    const Function &F = MF.getFunction();
+    // Otherwise, check for an nvptx-f32ftz attribute on the function
+    if (F.hasFnAttribute("nvptx-f32ftz"))
+      return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
+    else
+      return false;
+  }
+}
+
+static bool IsPTXVectorType(MVT VT) {
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::v2i1:
+  case MVT::v4i1:
+  case MVT::v2i8:
+  case MVT::v4i8:
+  case MVT::v2i16:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v2f16:
+  case MVT::v4f16:
+  case MVT::v8f16: // <4 x f16x2>
+  case MVT::v2f32:
+  case MVT::v4f32:
+  case MVT::v2f64:
+    return true;
+  }
+}
+
+/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
+/// into their primitive components.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
+                               uint64_t StartingOffset = 0) {
+  SmallVector<EVT, 16> TempVTs;
+  SmallVector<uint64_t, 16> TempOffsets;
+
+  // Special case for i128 - decompose to (i64, i64)
+  if (Ty->isIntegerTy(128)) {
+    ValueVTs.push_back(EVT(MVT::i64));
+    ValueVTs.push_back(EVT(MVT::i64));
+
+    if (Offsets) {
+      Offsets->push_back(StartingOffset + 0);
+      Offsets->push_back(StartingOffset + 8);
+    }
+
+    return;
+  }
+
+  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
+  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+    EVT VT = TempVTs[i];
+    uint64_t Off = TempOffsets[i];
+    // Split vectors into individual elements, except for v2f16, which
+    // we will pass as a single scalar.
+    if (VT.isVector()) {
+      unsigned NumElts = VT.getVectorNumElements();
+      EVT EltVT = VT.getVectorElementType();
+      // Vectors with an even number of f16 elements will be passed to
+      // us as an array of v2f16 elements. We must match this so we
+      // stay in sync with Ins/Outs.
+      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
+        EltVT = MVT::v2f16;
+        NumElts /= 2;
+      }
+      for (unsigned j = 0; j != NumElts; ++j) {
+        ValueVTs.push_back(EltVT);
+        if (Offsets)
+          Offsets->push_back(Off + j * EltVT.getStoreSize());
+      }
+    } else {
+      ValueVTs.push_back(VT);
+      if (Offsets)
+        Offsets->push_back(Off);
+    }
+  }
+}
+
+// Check whether we can merge loads/stores of some of the pieces of a
+// flattened function parameter or return value into a single vector
+// load/store.
+//
+// The flattened parameter is represented as a list of EVTs and
+// offsets, and the whole structure is aligned to ParamAlignment. This
+// function determines whether we can load/store pieces of the
+// parameter starting at index Idx using a single vectorized op of
+// size AccessSize. If so, it returns the number of param pieces
+// covered by the vector op. Otherwise, it returns 1.
+static unsigned CanMergeParamLoadStoresStartingAt(
+    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
+    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
+  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+
+  // Can't vectorize if param alignment is not sufficient.
+  if (AccessSize > ParamAlignment)
+    return 1;
+  // Can't vectorize if offset is not aligned.
+  if (Offsets[Idx] & (AccessSize - 1))
+    return 1;
+
+  EVT EltVT = ValueVTs[Idx];
+  unsigned EltSize = EltVT.getStoreSize();
+
+  // Element is too large to vectorize.
+  if (EltSize >= AccessSize)
+    return 1;
+
+  unsigned NumElts = AccessSize / EltSize;
+  // Can't vectorize if AccessSize is not a multiple of EltSize.
+  if (AccessSize != EltSize * NumElts)
+    return 1;
+
+  // We don't have enough elements to vectorize.
+  if (Idx + NumElts > ValueVTs.size())
+    return 1;
+
+  // PTX ISA can only deal with 2- and 4-element vector ops.
+  if (NumElts != 4 && NumElts != 2)
+    return 1;
+
+  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
+    // Types do not match.
+    if (ValueVTs[j] != EltVT)
+      return 1;
+
+    // Elements are not contiguous.
+    if (Offsets[j] - Offsets[j - 1] != EltSize)
+      return 1;
+  }
+  // OK. We can vectorize ValueVTs[i..i+NumElts)
+  return NumElts;
+}
+
+// Flags for tracking per-element vectorization state of loads/stores
+// of a flattened function parameter or return value.
+enum ParamVectorizationFlags {
+  PVF_INNER = 0x0,  // Middle elements of a vector.
+  PVF_FIRST = 0x1,  // First element of the vector.
+  PVF_LAST = 0x2,   // Last element of the vector.
+  // Scalar is effectively a 1-element vector.
+  PVF_SCALAR = PVF_FIRST | PVF_LAST
+};
+
+// Computes whether and how we can vectorize the loads/stores of a
+// flattened function parameter or return value.
+//
+// The flattened parameter is represented as the list of ValueVTs and
+// Offsets, and is aligned to ParamAlignment bytes. We return a vector
+// of the same size as ValueVTs indicating how each piece should be
+// loaded/stored (i.e. as a scalar, or as part of a vector
+// load/store).
+static SmallVector<ParamVectorizationFlags, 16>
+VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
+                     const SmallVectorImpl<uint64_t> &Offsets,
+                     unsigned ParamAlignment) {
+  // Set vector size to match ValueVTs and mark all elements as
+  // scalars by default.
+  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
+  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
+
+  // Check what we can vectorize using 128/64/32-bit accesses.
+  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
+    // Skip elements we've already processed.
+    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
+    for (unsigned AccessSize : {16, 8, 4, 2}) {
+      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
+      // Mark vectorized elements.
+      switch (NumElts) {
+      default:
+        llvm_unreachable("Unexpected return value");
+      case 1:
+        // Can't vectorize using this size, try next smaller size.
+        continue;
+      case 2:
+        assert(I + 1 < E && "Not enough elements.");
+        VectorInfo[I] = PVF_FIRST;
+        VectorInfo[I + 1] = PVF_LAST;
+        I += 1;
+        break;
+      case 4:
+        assert(I + 3 < E && "Not enough elements.");
+        VectorInfo[I] = PVF_FIRST;
+        VectorInfo[I + 1] = PVF_INNER;
+        VectorInfo[I + 2] = PVF_INNER;
+        VectorInfo[I + 3] = PVF_LAST;
+        I += 3;
+        break;
+      }
+      // Break out of the inner loop because we've already succeeded
+      // using largest possible AccessSize.
+      break;
+    }
+  }
+  return VectorInfo;
+}
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                                         const NVPTXSubtarget &STI)
+    : TargetLowering(TM), nvTM(&TM), STI(STI) {
+  // Always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather than generating calls to memset, memcpy, or memmove.
+  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+  // condition branches.
+  setJumpIsExpensive(true);
+
+  // Wide divides are _very_ slow. Try to reduce the width of the divide if
+  // possible.
+  addBypassSlowDiv(64, 32);
+
+  // By default, use the Source scheduling
+  if (sched4reg)
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
+                                    LegalizeAction NoF16Action) {
+    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
+  };
+
+  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
+  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
+
+  // Conversion to/from FP16/FP16x2 is always legal.
+  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
+  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
+  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
+
+  // Operations not directly supported by NVPTX.
+  setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
+  // For others we will expand to a SHL/SRA pair.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
+  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+
+  if (STI.hasROT64()) {
+    setOperationAction(ISD::ROTL, MVT::i64, Legal);
+    setOperationAction(ISD::ROTR, MVT::i64, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i64, Expand);
+    setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  }
+  if (STI.hasROT32()) {
+    setOperationAction(ISD::ROTL, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, MVT::i32, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i32, Expand);
+    setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  }
+
+  setOperationAction(ISD::ROTL, MVT::i16, Expand);
+  setOperationAction(ISD::ROTR, MVT::i16, Expand);
+  setOperationAction(ISD::ROTL, MVT::i8, Expand);
+  setOperationAction(ISD::ROTR, MVT::i8, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  // Indirect branch is not supported.
+  // This also disables Jump Table creation.
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // We want to legalize constant related memmove and memcpy
+  // intrinsics.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  // Turn FP extload into load/fpextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+  // Turn FP truncstore into trunc + store.
+  // FIXME: vector types should also be expanded
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PTX does not support load / store predicate registers
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setTruncStoreAction(VT, MVT::i1, Expand);
+  }
+
+  // This is legal in NVPTX
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+
+  // TRAP can be lowered to PTX trap
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
+  // Register custom handling for vector loads/stores
+  for (MVT VT : MVT::vector_valuetypes()) {
+    if (IsPTXVectorType(VT)) {
+      setOperationAction(ISD::LOAD, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
+      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+    }
+  }
+
+  // Custom handling for i8 intrinsics
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
+    setOperationAction(ISD::ABS, Ty, Legal);
+    setOperationAction(ISD::SMIN, Ty, Legal);
+    setOperationAction(ISD::SMAX, Ty, Legal);
+    setOperationAction(ISD::UMIN, Ty, Legal);
+    setOperationAction(ISD::UMAX, Ty, Legal);
+
+    setOperationAction(ISD::CTPOP, Ty, Legal);
+    setOperationAction(ISD::CTLZ, Ty, Legal);
+  }
+
+  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+
+  // PTX does not directly support SELP of i1, so promote to i32 first
+  setOperationAction(ISD::SELECT, MVT::i1, Custom);
+
+  // PTX cannot multiply two i64s in a single instruction.
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+  // We have some custom DAG combine patterns for these nodes
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SREM);
+  setTargetDAGCombine(ISD::UREM);
+
+  // setcc for f16x2 needs special handling to prevent legalizer's
+  // attempt to scalarize it due to v2i1 not being legal.
+  if (STI.allowFP16Math())
+    setTargetDAGCombine(ISD::SETCC);
+
+  // Promote fp16 arithmetic if fp16 hardware isn't available or the
+  // user passed --nvptx-no-fp16-math. The flag is useful because,
+  // although sm_53+ GPUs have some sort of FP16 support in
+  // hardware, only sm_53 and sm_60 have a full implementation. Others
+  // have only a token amount of hardware and are likely to run faster
+  // by using fp32 units instead.
+  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
+  }
+
+  // There's no neg.f16 instruction. Expand to (0-x).
+  setOperationAction(ISD::FNEG, MVT::f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
+
+  // (would be) Library functions.
+
+  // These map to conversion instructions for scalar FP types.
+  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+                         ISD::FROUND, ISD::FTRUNC}) {
+    setOperationAction(Op, MVT::f16, Legal);
+    setOperationAction(Op, MVT::f32, Legal);
+    setOperationAction(Op, MVT::f64, Legal);
+    setOperationAction(Op, MVT::v2f16, Expand);
+  }
+
+  // 'Expand' implements FCOPYSIGN without calling an external library.
+  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+
+  // These map to corresponding instructions for f32/f64. f16 must be
+  // promoted to f32. v2f16 is expanded to f16, which is then promoted
+  // to f32.
+  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
+                         ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+    setOperationAction(Op, MVT::f16, Promote);
+    setOperationAction(Op, MVT::f32, Legal);
+    setOperationAction(Op, MVT::f64, Legal);
+    setOperationAction(Op, MVT::v2f16, Expand);
+  }
+  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+  setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+  setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+
+  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+  // No FPOW or FREM in PTX.
+
+  // Now deduce the information based on the above mentioned
+  // actions
+  computeRegisterProperties(STI.getRegisterInfo());
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((NVPTXISD::NodeType)Opcode) {
+  case NVPTXISD::FIRST_NUMBER:
+    break;
+  case NVPTXISD::CALL:
+    return "NVPTXISD::CALL";
+  case NVPTXISD::RET_FLAG:
+    return "NVPTXISD::RET_FLAG";
+  case NVPTXISD::LOAD_PARAM:
+    return "NVPTXISD::LOAD_PARAM";
+  case NVPTXISD::Wrapper:
+    return "NVPTXISD::Wrapper";
+  case NVPTXISD::DeclareParam:
+    return "NVPTXISD::DeclareParam";
+  case NVPTXISD::DeclareScalarParam:
+    return "NVPTXISD::DeclareScalarParam";
+  case NVPTXISD::DeclareRet:
+    return "NVPTXISD::DeclareRet";
+  case NVPTXISD::DeclareScalarRet:
+    return "NVPTXISD::DeclareScalarRet";
+  case NVPTXISD::DeclareRetParam:
+    return "NVPTXISD::DeclareRetParam";
+  case NVPTXISD::PrintCall:
+    return "NVPTXISD::PrintCall";
+  case NVPTXISD::PrintConvergentCall:
+    return "NVPTXISD::PrintConvergentCall";
+  case NVPTXISD::PrintCallUni:
+    return "NVPTXISD::PrintCallUni";
+  case NVPTXISD::PrintConvergentCallUni:
+    return "NVPTXISD::PrintConvergentCallUni";
+  case NVPTXISD::LoadParam:
+    return "NVPTXISD::LoadParam";
+  case NVPTXISD::LoadParamV2:
+    return "NVPTXISD::LoadParamV2";
+  case NVPTXISD::LoadParamV4:
+    return "NVPTXISD::LoadParamV4";
+  case NVPTXISD::StoreParam:
+    return "NVPTXISD::StoreParam";
+  case NVPTXISD::StoreParamV2:
+    return "NVPTXISD::StoreParamV2";
+  case NVPTXISD::StoreParamV4:
+    return "NVPTXISD::StoreParamV4";
+  case NVPTXISD::StoreParamS32:
+    return "NVPTXISD::StoreParamS32";
+  case NVPTXISD::StoreParamU32:
+    return "NVPTXISD::StoreParamU32";
+  case NVPTXISD::CallArgBegin:
+    return "NVPTXISD::CallArgBegin";
+  case NVPTXISD::CallArg:
+    return "NVPTXISD::CallArg";
+  case NVPTXISD::LastCallArg:
+    return "NVPTXISD::LastCallArg";
+  case NVPTXISD::CallArgEnd:
+    return "NVPTXISD::CallArgEnd";
+  case NVPTXISD::CallVoid:
+    return "NVPTXISD::CallVoid";
+  case NVPTXISD::CallVal:
+    return "NVPTXISD::CallVal";
+  case NVPTXISD::CallSymbol:
+    return "NVPTXISD::CallSymbol";
+  case NVPTXISD::Prototype:
+    return "NVPTXISD::Prototype";
+  case NVPTXISD::MoveParam:
+    return "NVPTXISD::MoveParam";
+  case NVPTXISD::StoreRetval:
+    return "NVPTXISD::StoreRetval";
+  case NVPTXISD::StoreRetvalV2:
+    return "NVPTXISD::StoreRetvalV2";
+  case NVPTXISD::StoreRetvalV4:
+    return "NVPTXISD::StoreRetvalV4";
+  case NVPTXISD::PseudoUseParam:
+    return "NVPTXISD::PseudoUseParam";
+  case NVPTXISD::RETURN:
+    return "NVPTXISD::RETURN";
+  case NVPTXISD::CallSeqBegin:
+    return "NVPTXISD::CallSeqBegin";
+  case NVPTXISD::CallSeqEnd:
+    return "NVPTXISD::CallSeqEnd";
+  case NVPTXISD::CallPrototype:
+    return "NVPTXISD::CallPrototype";
+  case NVPTXISD::LoadV2:
+    return "NVPTXISD::LoadV2";
+  case NVPTXISD::LoadV4:
+    return "NVPTXISD::LoadV4";
+  case NVPTXISD::LDGV2:
+    return "NVPTXISD::LDGV2";
+  case NVPTXISD::LDGV4:
+    return "NVPTXISD::LDGV4";
+  case NVPTXISD::LDUV2:
+    return "NVPTXISD::LDUV2";
+  case NVPTXISD::LDUV4:
+    return "NVPTXISD::LDUV4";
+  case NVPTXISD::StoreV2:
+    return "NVPTXISD::StoreV2";
+  case NVPTXISD::StoreV4:
+    return "NVPTXISD::StoreV4";
+  case NVPTXISD::FUN_SHFL_CLAMP:
+    return "NVPTXISD::FUN_SHFL_CLAMP";
+  case NVPTXISD::FUN_SHFR_CLAMP:
+    return "NVPTXISD::FUN_SHFR_CLAMP";
+  case NVPTXISD::IMAD:
+    return "NVPTXISD::IMAD";
+  case NVPTXISD::SETP_F16X2:
+    return "NVPTXISD::SETP_F16X2";
+  case NVPTXISD::Dummy:
+    return "NVPTXISD::Dummy";
+  case NVPTXISD::MUL_WIDE_SIGNED:
+    return "NVPTXISD::MUL_WIDE_SIGNED";
+  case NVPTXISD::MUL_WIDE_UNSIGNED:
+    return "NVPTXISD::MUL_WIDE_UNSIGNED";
+  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
+  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    return "NVPTXISD::Tex1DFloatFloatLevel";
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    return "NVPTXISD::Tex1DFloatFloatGrad";
+  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
+  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
+  case NVPTXISD::Tex1DS32FloatLevel:
+    return "NVPTXISD::Tex1DS32FloatLevel";
+  case NVPTXISD::Tex1DS32FloatGrad:
+    return "NVPTXISD::Tex1DS32FloatGrad";
+  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
+  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
+  case NVPTXISD::Tex1DU32FloatLevel:
+    return "NVPTXISD::Tex1DU32FloatLevel";
+  case NVPTXISD::Tex1DU32FloatGrad:
+    return "NVPTXISD::Tex1DU32FloatGrad";
+  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
+  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
+  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    return "NVPTXISD::Tex1DArrayS32FloatLevel";
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    return "NVPTXISD::Tex1DArrayS32FloatGrad";
+  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
+  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    return "NVPTXISD::Tex1DArrayU32FloatLevel";
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    return "NVPTXISD::Tex1DArrayU32FloatGrad";
+  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
+  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    return "NVPTXISD::Tex2DFloatFloatLevel";
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    return "NVPTXISD::Tex2DFloatFloatGrad";
+  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
+  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
+  case NVPTXISD::Tex2DS32FloatLevel:
+    return "NVPTXISD::Tex2DS32FloatLevel";
+  case NVPTXISD::Tex2DS32FloatGrad:
+    return "NVPTXISD::Tex2DS32FloatGrad";
+  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
+  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
+  case NVPTXISD::Tex2DU32FloatLevel:
+    return "NVPTXISD::Tex2DU32FloatLevel";
+  case NVPTXISD::Tex2DU32FloatGrad:
+    return "NVPTXISD::Tex2DU32FloatGrad";
+  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
+  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
+  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    return "NVPTXISD::Tex2DArrayS32FloatLevel";
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    return "NVPTXISD::Tex2DArrayS32FloatGrad";
+  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
+  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    return "NVPTXISD::Tex2DArrayU32FloatLevel";
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    return "NVPTXISD::Tex2DArrayU32FloatGrad";
+  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
+  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    return "NVPTXISD::Tex3DFloatFloatLevel";
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    return "NVPTXISD::Tex3DFloatFloatGrad";
+  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
+  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
+  case NVPTXISD::Tex3DS32FloatLevel:
+    return "NVPTXISD::Tex3DS32FloatLevel";
+  case NVPTXISD::Tex3DS32FloatGrad:
+    return "NVPTXISD::Tex3DS32FloatGrad";
+  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
+  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
+  case NVPTXISD::Tex3DU32FloatLevel:
+    return "NVPTXISD::Tex3DU32FloatLevel";
+  case NVPTXISD::Tex3DU32FloatGrad:
+    return "NVPTXISD::Tex3DU32FloatGrad";
+  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    return "NVPTXISD::TexCubeFloatFloatLevel";
+  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
+  case NVPTXISD::TexCubeS32FloatLevel:
+    return "NVPTXISD::TexCubeS32FloatLevel";
+  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
+  case NVPTXISD::TexCubeU32FloatLevel:
+    return "NVPTXISD::TexCubeU32FloatLevel";
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    return "NVPTXISD::TexCubeArrayFloatFloat";
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexCubeArrayS32Float:
+    return "NVPTXISD::TexCubeArrayS32Float";
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexCubeArrayS32FloatLevel";
+  case NVPTXISD::TexCubeArrayU32Float:
+    return "NVPTXISD::TexCubeArrayU32Float";
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4R2DFloatFloat:
+    return "NVPTXISD::Tld4R2DFloatFloat";
+  case NVPTXISD::Tld4G2DFloatFloat:
+    return "NVPTXISD::Tld4G2DFloatFloat";
+  case NVPTXISD::Tld4B2DFloatFloat:
+    return "NVPTXISD::Tld4B2DFloatFloat";
+  case NVPTXISD::Tld4A2DFloatFloat:
+    return "NVPTXISD::Tld4A2DFloatFloat";
+  case NVPTXISD::Tld4R2DS64Float:
+    return "NVPTXISD::Tld4R2DS64Float";
+  case NVPTXISD::Tld4G2DS64Float:
+    return "NVPTXISD::Tld4G2DS64Float";
+  case NVPTXISD::Tld4B2DS64Float:
+    return "NVPTXISD::Tld4B2DS64Float";
+  case NVPTXISD::Tld4A2DS64Float:
+    return "NVPTXISD::Tld4A2DS64Float";
+  case NVPTXISD::Tld4R2DU64Float:
+    return "NVPTXISD::Tld4R2DU64Float";
+  case NVPTXISD::Tld4G2DU64Float:
+    return "NVPTXISD::Tld4G2DU64Float";
+  case NVPTXISD::Tld4B2DU64Float:
+    return "NVPTXISD::Tld4B2DU64Float";
+  case NVPTXISD::Tld4A2DU64Float:
+    return "NVPTXISD::Tld4A2DU64Float";
+
+  case NVPTXISD::TexUnified1DFloatS32:
+    return "NVPTXISD::TexUnified1DFloatS32";
+  case NVPTXISD::TexUnified1DFloatFloat:
+    return "NVPTXISD::TexUnified1DFloatFloat";
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DFloatFloatLevel";
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DFloatFloatGrad";
+  case NVPTXISD::TexUnified1DS32S32:
+    return "NVPTXISD::TexUnified1DS32S32";
+  case NVPTXISD::TexUnified1DS32Float:
+    return "NVPTXISD::TexUnified1DS32Float";
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    return "NVPTXISD::TexUnified1DS32FloatLevel";
+ case NVPTXISD::TexUnified1DS32FloatGrad: + return "NVPTXISD::TexUnified1DS32FloatGrad"; + case NVPTXISD::TexUnified1DU32S32: + return "NVPTXISD::TexUnified1DU32S32"; + case NVPTXISD::TexUnified1DU32Float: + return "NVPTXISD::TexUnified1DU32Float"; + case NVPTXISD::TexUnified1DU32FloatLevel: + return "NVPTXISD::TexUnified1DU32FloatLevel"; + case NVPTXISD::TexUnified1DU32FloatGrad: + return "NVPTXISD::TexUnified1DU32FloatGrad"; + case NVPTXISD::TexUnified1DArrayFloatS32: + return "NVPTXISD::TexUnified1DArrayFloatS32"; + case NVPTXISD::TexUnified1DArrayFloatFloat: + return "NVPTXISD::TexUnified1DArrayFloatFloat"; + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified1DArrayS32S32: + return "NVPTXISD::TexUnified1DArrayS32S32"; + case NVPTXISD::TexUnified1DArrayS32Float: + return "NVPTXISD::TexUnified1DArrayS32Float"; + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; + case NVPTXISD::TexUnified1DArrayU32S32: + return "NVPTXISD::TexUnified1DArrayU32S32"; + case NVPTXISD::TexUnified1DArrayU32Float: + return "NVPTXISD::TexUnified1DArrayU32Float"; + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; + case NVPTXISD::TexUnified2DFloatS32: + return "NVPTXISD::TexUnified2DFloatS32"; + case NVPTXISD::TexUnified2DFloatFloat: + return "NVPTXISD::TexUnified2DFloatFloat"; + case NVPTXISD::TexUnified2DFloatFloatLevel: + return "NVPTXISD::TexUnified2DFloatFloatLevel"; + case NVPTXISD::TexUnified2DFloatFloatGrad: + return "NVPTXISD::TexUnified2DFloatFloatGrad"; + case NVPTXISD::TexUnified2DS32S32: + return "NVPTXISD::TexUnified2DS32S32"; + case NVPTXISD::TexUnified2DS32Float: + return "NVPTXISD::TexUnified2DS32Float"; + case NVPTXISD::TexUnified2DS32FloatLevel: + return "NVPTXISD::TexUnified2DS32FloatLevel"; + case NVPTXISD::TexUnified2DS32FloatGrad: + return "NVPTXISD::TexUnified2DS32FloatGrad"; + case NVPTXISD::TexUnified2DU32S32: + return "NVPTXISD::TexUnified2DU32S32"; + case NVPTXISD::TexUnified2DU32Float: + return "NVPTXISD::TexUnified2DU32Float"; + case NVPTXISD::TexUnified2DU32FloatLevel: + return "NVPTXISD::TexUnified2DU32FloatLevel"; + case NVPTXISD::TexUnified2DU32FloatGrad: + return "NVPTXISD::TexUnified2DU32FloatGrad"; + case NVPTXISD::TexUnified2DArrayFloatS32: + return "NVPTXISD::TexUnified2DArrayFloatS32"; + case NVPTXISD::TexUnified2DArrayFloatFloat: + return "NVPTXISD::TexUnified2DArrayFloatFloat"; + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified2DArrayS32S32: + return "NVPTXISD::TexUnified2DArrayS32S32"; + case NVPTXISD::TexUnified2DArrayS32Float: + return "NVPTXISD::TexUnified2DArrayS32Float"; + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; + case NVPTXISD::TexUnified2DArrayU32S32: + return "NVPTXISD::TexUnified2DArrayU32S32"; + case NVPTXISD::TexUnified2DArrayU32Float: + 
return "NVPTXISD::TexUnified2DArrayU32Float"; + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; + case NVPTXISD::TexUnified3DFloatS32: + return "NVPTXISD::TexUnified3DFloatS32"; + case NVPTXISD::TexUnified3DFloatFloat: + return "NVPTXISD::TexUnified3DFloatFloat"; + case NVPTXISD::TexUnified3DFloatFloatLevel: + return "NVPTXISD::TexUnified3DFloatFloatLevel"; + case NVPTXISD::TexUnified3DFloatFloatGrad: + return "NVPTXISD::TexUnified3DFloatFloatGrad"; + case NVPTXISD::TexUnified3DS32S32: + return "NVPTXISD::TexUnified3DS32S32"; + case NVPTXISD::TexUnified3DS32Float: + return "NVPTXISD::TexUnified3DS32Float"; + case NVPTXISD::TexUnified3DS32FloatLevel: + return "NVPTXISD::TexUnified3DS32FloatLevel"; + case NVPTXISD::TexUnified3DS32FloatGrad: + return "NVPTXISD::TexUnified3DS32FloatGrad"; + case NVPTXISD::TexUnified3DU32S32: + return "NVPTXISD::TexUnified3DU32S32"; + case NVPTXISD::TexUnified3DU32Float: + return "NVPTXISD::TexUnified3DU32Float"; + case NVPTXISD::TexUnified3DU32FloatLevel: + return "NVPTXISD::TexUnified3DU32FloatLevel"; + case NVPTXISD::TexUnified3DU32FloatGrad: + return "NVPTXISD::TexUnified3DU32FloatGrad"; + case NVPTXISD::TexUnifiedCubeFloatFloat: + return "NVPTXISD::TexUnifiedCubeFloatFloat"; + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeS32Float: + return "NVPTXISD::TexUnifiedCubeS32Float"; + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeU32Float: + return "NVPTXISD::TexUnifiedCubeU32Float"; + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayS32Float: + return "NVPTXISD::TexUnifiedCubeArrayS32Float"; + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayU32Float: + return "NVPTXISD::TexUnifiedCubeArrayU32Float"; + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; + case NVPTXISD::Tld4UnifiedR2DS64Float: + return "NVPTXISD::Tld4UnifiedR2DS64Float"; + case NVPTXISD::Tld4UnifiedG2DS64Float: + return "NVPTXISD::Tld4UnifiedG2DS64Float"; + case NVPTXISD::Tld4UnifiedB2DS64Float: + return "NVPTXISD::Tld4UnifiedB2DS64Float"; + case NVPTXISD::Tld4UnifiedA2DS64Float: + return "NVPTXISD::Tld4UnifiedA2DS64Float"; + case NVPTXISD::Tld4UnifiedR2DU64Float: + return "NVPTXISD::Tld4UnifiedR2DU64Float"; + case NVPTXISD::Tld4UnifiedG2DU64Float: + return "NVPTXISD::Tld4UnifiedG2DU64Float"; + case NVPTXISD::Tld4UnifiedB2DU64Float: + return "NVPTXISD::Tld4UnifiedB2DU64Float"; + case NVPTXISD::Tld4UnifiedA2DU64Float: + return "NVPTXISD::Tld4UnifiedA2DU64Float"; + + case NVPTXISD::Suld1DI8Clamp: return 
"NVPTXISD::Suld1DI8Clamp"; + case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; + case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; + case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; + case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; + case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; + case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; + case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; + case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; + case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; + case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; + + case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; + case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; + case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; + case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; + case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; + case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; + case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; + case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; + case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; + case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; + case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; + + case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; + case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; + case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; + case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; + case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; + case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; + case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; + case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; + case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; + case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; + case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; + + case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; + case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; + case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; + case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; + case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; + case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; + case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; + case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; + case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; + case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; + case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; + + case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; + case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; + case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; + case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; + case 
NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; + case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; + case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; + case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; + case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; + case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; + case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; + + case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; + case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; + case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; + case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; + case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; + case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; + case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; + case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; + case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; + case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; + case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; + + case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; + case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; + case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; + case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; + case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; + case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; + case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; + case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; + case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; + case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; + case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; + + case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; + case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; + case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; + case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; + case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; + case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; + case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; + case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; + case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; + case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; + case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; + + case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; + case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; + case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; + case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; + case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; + case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; + case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; + case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; + case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; + 
case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; + case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; + + case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; + case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; + case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; + case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; + case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; + case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; + case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; + case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; + case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; + case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; + case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; + + case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; + case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; + case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; + case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; + case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; + case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; + case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; + case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; + case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; + case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; + case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; + + case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; + case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; + case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; + case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; + case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; + case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; + case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; + case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; + case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; + case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; + case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; + + case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; + case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; + case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; + case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; + case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; + case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; + case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; + case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; + case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; + case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; + case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; + + case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; + case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; + case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; + case NVPTXISD::Suld2DArrayI64Zero: return 
"NVPTXISD::Suld2DArrayI64Zero"; + case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; + case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; + case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; + case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; + case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; + case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; + case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; + + case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; + case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; + case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; + case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; + case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; + case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; + case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; + case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; + case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; + case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; + case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; + } + return nullptr; +} + +TargetLoweringBase::LegalizeTypeAction +NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) + return TypeSplitVector; + if (VT == MVT::v2f16) + return TypeLegal; + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, + int Enabled, int &ExtraSteps, + bool &UseOneConst, + bool Reciprocal) const { + if (!(Enabled == ReciprocalEstimate::Enabled || + (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) + return SDValue(); + + if (ExtraSteps == ReciprocalEstimate::Unspecified) + ExtraSteps = 0; + + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + bool Ftz = useF32FTZ(DAG.getMachineFunction()); + + auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(IID, DL, MVT::i32), Operand); + }; + + // The sqrt and rsqrt refinement processes assume we always start out with an + // approximation of the rsqrt. Therefore, if we're going to do any refinement + // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing + // any refinement, we must return a regular sqrt. + if (Reciprocal || ExtraSteps > 0) { + if (VT == MVT::f32) + return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f + : Intrinsic::nvvm_rsqrt_approx_f); + else if (VT == MVT::f64) + return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); + else + return SDValue(); + } else { + if (VT == MVT::f32) + return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f + : Intrinsic::nvvm_sqrt_approx_f); + else { + // There's no sqrt.approx.f64 instruction, so we emit + // reciprocal(rsqrt(x)). This is faster than + // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain + // x * rsqrt(x).) 
+ return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), + MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); + } + } +} + +SDValue +NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT); + return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); +} + +std::string NVPTXTargetLowering::getPrototype( + const DataLayout &DL, Type *retTy, const ArgListTy &Args, + const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment, + ImmutableCallSite CS) const { + auto PtrVT = getPointerTy(DL); + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return ""; + + std::stringstream O; + O << "prototype_" << uniqueCallSite << " : .callprototype "; + + if (retTy->getTypeID() == Type::VoidTyID) { + O << "()"; + } else { + O << "("; + if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) { + unsigned size = 0; + if (auto *ITy = dyn_cast<IntegerType>(retTy)) { + size = ITy->getBitWidth(); + } else { + assert(retTy->isFloatingPointTy() && + "Floating point type expected here"); + size = retTy->getPrimitiveSizeInBits(); + } + // PTX ABI requires all scalar return values to be at least 32 + // bits in size. fp16 normally uses .b16 as its storage type in + // PTX, so its size must be adjusted here, too. + if (size < 32) + size = 32; + + O << ".param .b" << size << " _"; + } else if (isa<PointerType>(retTy)) { + O << ".param .b" << PtrVT.getSizeInBits() << " _"; + } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { + auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); + O << ".param .align " << retAlignment << " .b8 _[" + << DL.getTypeAllocSize(retTy) << "]"; + } else { + llvm_unreachable("Unknown return type"); + } + O << ") "; + } + O << "_ ("; + + bool first = true; + + unsigned OIdx = 0; + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + Type *Ty = Args[i].Ty; + if (!first) { + O << ", "; + } + first = false; + + if (!Outs[OIdx].Flags.isByVal()) { + if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { + unsigned align = 0; + const CallInst *CallI = cast<CallInst>(CS.getInstruction()); + // +1 because index 0 is reserved for return type alignment + if (!getAlign(*CallI, i + 1, align)) + align = DL.getABITypeAlignment(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); + O << ".param .align " << align << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + // update the index for Outs + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, DL, Ty, vtparts); + if (unsigned len = vtparts.size()) + OIdx += len - 1; + continue; + } + // i8 types in IR will be i16 types in SDAG + assert((getValueType(DL, Ty) == Outs[OIdx].VT || + (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && + "type mismatch between callee prototype and arguments"); + // scalar type + unsigned sz = 0; + if (isa<IntegerType>(Ty)) { + sz = cast<IntegerType>(Ty)->getBitWidth(); + if (sz < 32) + sz = 32; + } else if (isa<PointerType>(Ty)) { + sz = PtrVT.getSizeInBits(); + } else if (Ty->isHalfTy()) + // PTX ABI requires all scalar parameters to be at least 32 + // bits in size. fp16 normally uses .b16 as its storage type + // in PTX, so its size must be adjusted here, too. 
+ sz = 32; + else + sz = Ty->getPrimitiveSizeInBits(); + O << ".param .b" << sz << " "; + O << "_"; + continue; + } + auto *PTy = dyn_cast<PointerType>(Ty); + assert(PTy && "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + unsigned align = Outs[OIdx].Flags.getByValAlign(); + unsigned sz = DL.getTypeAllocSize(ETy); + O << ".param .align " << align << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + } + O << ");"; + return O.str(); +} + +unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, + ImmutableCallSite CS, + Type *Ty, unsigned Idx, + const DataLayout &DL) const { + if (!CS) { + // CallSite is zero, fallback to ABI type alignment + return DL.getABITypeAlignment(Ty); + } + + unsigned Align = 0; + const Value *DirectCallee = CS.getCalledFunction(); + + if (!DirectCallee) { + // We don't have a direct function symbol, but that may be because of + // constant cast instructions in the call. + const Instruction *CalleeI = CS.getInstruction(); + assert(CalleeI && "Call target is not a function or derived value?"); + + // With bitcast'd call targets, the instruction will be the call + if (isa<CallInst>(CalleeI)) { + // Check if we have call alignment metadata + if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) + return Align; + + const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); + // Ignore any bitcast instructions + while (isa<ConstantExpr>(CalleeV)) { + const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); + if (!CE->isCast()) + break; + // Look through the bitcast + CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); + } + + // We have now looked past all of the bitcasts. Do we finally have a + // Function? + if (isa<Function>(CalleeV)) + DirectCallee = CalleeV; + } + } + + // Check for function alignment information if we found that the + // ultimate target is a Function + if (DirectCallee) + if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) + return Align; + + // Call is indirect or alignment information is not available, fall back to + // the ABI type alignment + return DL.getABITypeAlignment(Ty); +} + +SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + ArgListTy &Args = CLI.getArgs(); + Type *RetTy = CLI.RetTy; + ImmutableCallSite CS = CLI.CS; + const DataLayout &DL = DAG.getDataLayout(); + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return Chain; + + SDValue tempChain = Chain; + Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); + SDValue InFlag = Chain.getValue(1); + + unsigned paramCount = 0; + // Args.size() and Outs.size() need not match. + // Outs.size() will be larger + // * if there is an aggregate argument with multiple fields (each field + // showing up separately in Outs) + // * if there is a vector argument with more than typical vector-length + // elements (generally if more than 4) where each vector element is + // individually present in Outs. + // So a different index should be used for indexing into Outs/OutVals. + // See similar issue in LowerFormalArguments. 
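
For a concrete picture of what getPrototype above returns: for a hypothetical indirect callee of type float(int, float*) on a 64-bit target, the rules above (scalars widened to at least .b32, pointers at pointer width) would produce a string roughly like:

prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
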
+  unsigned OIdx = 0;
+  // Declare the .param or .reg variables needed to pass values
+  // to the function.
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    EVT VT = Outs[OIdx].VT;
+    Type *Ty = Args[i].Ty;
+
+    if (!Outs[OIdx].Flags.isByVal()) {
+      SmallVector<EVT, 16> VTs;
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
+      unsigned ArgAlign =
+          getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+      unsigned AllocSize = DL.getTypeAllocSize(Ty);
+      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      bool NeedAlign; // Does argument declaration specify alignment?
+      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        SDValue DeclareParamOps[] = {
+            Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+            DAG.getConstant(paramCount, dl, MVT::i32),
+            DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps);
+        NeedAlign = true;
+      } else {
+        // declare .param .b<size> .param<n>;
+        if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
+          // PTX ABI requires integral types to be at least 32 bits in
+          // size. FP16 is loaded/stored using i16, so it's handled
+          // here as well.
+          AllocSize = 4;
+        }
+        SDValue DeclareScalarParamOps[] = {
+            Chain, DAG.getConstant(paramCount, dl, MVT::i32),
+            DAG.getConstant(AllocSize * 8, dl, MVT::i32),
+            DAG.getConstant(0, dl, MVT::i32), InFlag};
+        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                            DeclareScalarParamOps);
+        NeedAlign = false;
+      }
+      InFlag = Chain.getValue(1);
+
+      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+      // than 32-bits are sign extended or zero extended, depending on
+      // whether they are signed or unsigned types. This case applies
+      // only to scalar parameters and not to aggregate values.
+      bool ExtendIntegerParam =
+          Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
+
+      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+      SmallVector<SDValue, 6> StoreOperands;
+      for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+        // New store.
+        if (VectorInfo[j] & PVF_FIRST) {
+          assert(StoreOperands.empty() && "Unfinished preceding store.");
+          StoreOperands.push_back(Chain);
+          StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+          StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
+        }
+
+        EVT EltVT = VTs[j];
+        SDValue StVal = OutVals[OIdx];
+        if (ExtendIntegerParam) {
+          assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
+          // zext/sext to i32
+          StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                        : ISD::ZERO_EXTEND,
+                              dl, MVT::i32, StVal);
+        } else if (EltVT.getSizeInBits() < 16) {
+          // Use 16-bit registers for small stores as it's the
+          // smallest general purpose register size supported by NVPTX.
+          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+        }
+
+        // Record the value to store.
+        StoreOperands.push_back(StVal);
+
+        if (VectorInfo[j] & PVF_LAST) {
+          unsigned NumElts = StoreOperands.size() - 3;
+          NVPTXISD::NodeType Op;
+          switch (NumElts) {
+          case 1:
+            Op = NVPTXISD::StoreParam;
+            break;
+          case 2:
+            Op = NVPTXISD::StoreParamV2;
+            break;
+          case 4:
+            Op = NVPTXISD::StoreParamV4;
+            break;
+          default:
+            llvm_unreachable("Invalid vector info.");
+          }
+
+          StoreOperands.push_back(InFlag);
+
+          // Adjust type of the store op if we've extended the scalar
+          // return value.
+          EVT TheStoreType = ExtendIntegerParam ?
+              MVT::i32 : VTs[j];
+          unsigned EltAlign =
+              NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
+
+          Chain = DAG.getMemIntrinsicNode(
+              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+              TheStoreType, MachinePointerInfo(), EltAlign,
+              MachineMemOperand::MOStore);
+          InFlag = Chain.getValue(1);
+
+          // Cleanup.
+          StoreOperands.clear();
+        }
+        ++OIdx;
+      }
+      assert(StoreOperands.empty() && "Unfinished parameter store.");
+      if (VTs.size() > 0)
+        --OIdx;
+      ++paramCount;
+      continue;
+    }
+
+    // ByVal arguments
+    SmallVector<EVT, 16> VTs;
+    SmallVector<uint64_t, 16> Offsets;
+    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
+    assert(PTy && "Type of a byval parameter should be pointer");
+    ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
+
+    // declare .param .align <align> .b8 .param<n>[<size>];
+    unsigned sz = Outs[OIdx].Flags.getByValSize();
+    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
+    // so we don't need to worry about whether it is the natural alignment.
+    // See TargetLowering::LowerCallTo().
+
+    // Enforce minimum alignment of 4 to work around ptxas miscompile
+    // for sm_50+. See corresponding alignment adjustment in
+    // emitFunctionParamList() for details.
+    if (ArgAlign < 4)
+      ArgAlign = 4;
+    SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(sz, dl, MVT::i32), InFlag};
+    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                        DeclareParamOps);
+    InFlag = Chain.getValue(1);
+    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+      EVT elemtype = VTs[j];
+      int curOffset = Offsets[j];
+      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+      auto PtrVT = getPointerTy(DL);
+      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+                                    DAG.getConstant(curOffset, dl, PtrVT));
+      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                                   MachinePointerInfo(), PartAlign);
+      if (elemtype.getSizeInBits() < 16) {
+        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+      }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain,
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(curOffset, dl, MVT::i32),
+                                 theVal, InFlag };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                                      CopyParamOps, elemtype,
+                                      MachinePointerInfo(), /* Align */ 0,
+                                      MachineMemOperand::MOStore);
+
+      InFlag = Chain.getValue(1);
+    }
+    ++paramCount;
+  }
+
+  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+  unsigned retAlignment = 0;
+
+  // Handle Result
+  if (Ins.size() > 0) {
+    SmallVector<EVT, 16> resvtparts;
+    ComputeValueVTs(*this, DL, RetTy, resvtparts);
+
+    // Declare
+    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
+    //  .param .b<size-in-bits> retval0
+    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
+    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+    // these three types to match the logic in
+    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+    // Plus, this behavior is consistent with nvcc's.
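
The GreatestCommonDivisor64 calls above compute the strongest alignment that provably holds at base + offset; a quick illustration, with std::gcd standing in for LLVM's helper:

#include <cassert>
#include <numeric>

int main() {
  // An 8-byte-aligned param block: the element at offset 4 is only
  // guaranteed 4-byte alignment, while offset 0 keeps the full 8.
  assert(std::gcd(8, 4) == 4);
  assert(std::gcd(8, 0) == 8);
  return 0;
}
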
+    if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
+        (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
+      // Scalar needs to be at least 32 bits wide.
+      if (resultsz < 32)
+        resultsz = 32;
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+                                  DAG.getConstant(resultsz, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                          DeclareRetOps);
+      InFlag = Chain.getValue(1);
+    } else {
+      retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain,
+                                  DAG.getConstant(retAlignment, dl, MVT::i32),
+                                  DAG.getConstant(resultsz / 8, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+                          DeclareRetOps);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  if (!Func) {
+    // This is the indirect function call case: PTX requires a prototype of
+    // the form
+    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+    // to be emitted, and the label has to be used as the last arg of the call
+    // instruction.
+    // The prototype is embedded in a string and put as the operand for a
+    // CallPrototype SDNode, which will print out to the value of the string.
+    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
+    const char *ProtoStr =
+        nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+    SDValue ProtoOps[] = {
+        Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+    };
+    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+    InFlag = Chain.getValue(1);
+  }
+  // Op to just print "call"
+  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue PrintCallOps[] = {
+    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
+  };
+  // We model convergent calls as separate opcodes.
+  unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+  if (CLI.IsConvergent)
+    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+                                              : NVPTXISD::PrintConvergentCall;
+  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the function name
+  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the param list
+  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgBeginOps[] = { Chain, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+                      CallArgBeginOps);
+  InFlag = Chain.getValue(1);
+
+  for (unsigned i = 0, e = paramCount; i != e; ++i) {
+    unsigned opcode;
+    if (i == (e - 1))
+      opcode = NVPTXISD::LastCallArg;
+    else
+      opcode = NVPTXISD::CallArg;
+    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+                             DAG.getConstant(i, dl, MVT::i32), InFlag };
+    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+    InFlag = Chain.getValue(1);
+  }
+  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgEndOps[] = { Chain,
+                              DAG.getConstant(Func ?
1 : 0, dl, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); + InFlag = Chain.getValue(1); + + if (!Func) { + SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrototypeOps[] = { Chain, + DAG.getConstant(uniqueCallSite, dl, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); + InFlag = Chain.getValue(1); + } + + // Generate loads from param memory/moves from registers for result + if (Ins.size() > 0) { + SmallVector<EVT, 16> VTs; + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); + assert(VTs.size() == Ins.size() && "Bad value decomposition"); + + unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); + + SmallVector<EVT, 6> LoadVTs; + int VecIdx = -1; // Index of the first element of the vector. + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { + bool needTruncate = false; + EVT TheLoadType = VTs[i]; + EVT EltType = Ins[i].VT; + unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); + if (ExtendIntegerRetVal) { + TheLoadType = MVT::i32; + EltType = MVT::i32; + needTruncate = true; + } else if (TheLoadType.getSizeInBits() < 16) { + if (VTs[i].isInteger()) + needTruncate = true; + EltType = MVT::i16; + } + + // Record index of the very first element of the vector. + if (VectorInfo[i] & PVF_FIRST) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + VecIdx = i; + } + + LoadVTs.push_back(EltType); + + if (VectorInfo[i] & PVF_LAST) { + unsigned NumElts = LoadVTs.size(); + LoadVTs.push_back(MVT::Other); + LoadVTs.push_back(MVT::Glue); + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::LoadParam; + break; + case 2: + Op = NVPTXISD::LoadParamV2; + break; + case 4: + Op = NVPTXISD::LoadParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + SDValue LoadOperands[] = { + Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; + SDValue RetVal = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, + MachinePointerInfo(), EltAlign, + MachineMemOperand::MOLoad); + + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Ret = RetVal.getValue(j); + if (needTruncate) + Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret); + InVals.push_back(Ret); + } + Chain = RetVal.getValue(NumElts); + InFlag = RetVal.getValue(NumElts + 1); + + // Cleanup + VecIdx = -1; + LoadVTs.clear(); + } + } + } + + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(uniqueCallSite, dl, true), + DAG.getIntPtrConstant(uniqueCallSite + 1, dl, + true), + InFlag, dl); + uniqueCallSite++; + + // set isTailCall to false for now, until we figure out how to express + // tail call optimization in PTX + isTailCall = false; + return Chain; +} + +// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() +// (see LegalizeDAG.cpp). This is slow and uses local memory. 
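
Taken together, the CallPrototype/PrintCall/CallVoid/CallArg nodes built in LowerCall above print a PTX call sequence of roughly this shape for an indirect call with one parameter and a return value (register and label names are illustrative, not what the backend necessarily picks):

prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _);
.param .b32 param0;
st.param.b32 [param0+0], %r1;
.param .b32 retval0;
call (retval0), %rd1, (param0), prototype_0;
ld.param.b32 %r2, [retval0+0];
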
+// We use extract/insert/build vector just as LegalizeOp() did in llvm 2.5.
+SDValue
+NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    SDValue SubOp = Node->getOperand(i);
+    EVT VVT = SubOp.getNode()->getValueType(0);
+    EVT EltVT = VVT.getVectorElementType();
+    unsigned NumSubElem = VVT.getVectorNumElements();
+    for (unsigned j = 0; j < NumSubElem; ++j) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+                                DAG.getIntPtrConstant(j, dl)));
+    }
+  }
+  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
+}
+
+// We can init constant f16x2 with a single .b32 move. Normally it
+// would get lowered as two constant loads and vector-packing move.
+//   mov.b16 %h1, 0x4000;
+//   mov.b16 %h2, 0x3C00;
+//   mov.b32 %hh2, {%h2, %h1};
+// Instead we want just a constant move:
+//   mov.b32 %hh2, 0x40003C00
+//
+// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+// generates good SASS in both cases.
+SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  if (!(Op->getValueType(0) == MVT::v2f16 &&
+        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+        isa<ConstantFPSDNode>(Op->getOperand(1))))
+    return Op;
+
+  APInt E0 =
+      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+  APInt E1 =
+      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+  SDValue Const =
+      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+}
+
+SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDValue Index = Op->getOperand(1);
+  // Constant index will be matched by tablegen.
+  if (isa<ConstantSDNode>(Index.getNode()))
+    return Op;
+
+  // Extract individual elements and select one of them.
+  SDValue Vector = Op->getOperand(0);
+  EVT VectorVT = Vector.getValueType();
+  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+  EVT EltVT = VectorVT.getVectorElementType();
+
+  SDLoc dl(Op.getNode());
+  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+                           DAG.getIntPtrConstant(0, dl));
+  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+                           DAG.getIntPtrConstant(1, dl));
+  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
+                         ISD::CondCode::SETEQ);
+}
+
+/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
+/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
+///    amount, or
+/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
+///    amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  if (VTBits == 32 && STI.getSmVersion() >= 35) {
+    // For 32-bit and sm35, we can use the funnel shift 'shf' instruction.
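
Stepping back to LowerBUILD_VECTOR above: the E1.zext(32).shl(16) | E0.zext(32) packing can be reproduced with plain integer arithmetic. A small illustrative check, with the fp16 bit patterns hard-coded:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t E0 = 0x3C00; // fp16 bits of 1.0, element 0 (low half)
  uint32_t E1 = 0x4000; // fp16 bits of 2.0, element 1 (high half)
  uint32_t Packed = (E1 << 16) | E0;
  assert(Packed == 0x40003C00); // one mov.b32 instead of two mov.b16 + pack
  return 0;
}
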
+ // {dHi, dLo} = {aHi, aLo} >> Amt + // dHi = aHi >> Amt + // dLo = shf.r.clamp aLo, aHi, Amt + + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + // {dHi, dLo} = {aHi, aLo} >> Amt + // - if (Amt>=size) then + // dLo = aHi >> (Amt-size) + // dHi = aHi >> Amt (this is either all 0 or all 1) + // else + // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) + // dHi = aHi >> Amt + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, dl, MVT::i32), + ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32), + ISD::SETGE); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SHL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + if (VTBits == 32 && STI.getSmVersion() >= 35) { + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
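
The generic (non-shf) expansion in LowerShiftRightParts above can be checked in ordinary C++. A sketch for the logical-shift case; the Amt == 0 case is split out because C++, unlike the clamped PTX shifts, leaves over-wide shifts undefined:

#include <cassert>
#include <cstdint>

static uint64_t srl64_parts(uint32_t Lo, uint32_t Hi, unsigned Amt) {
  uint32_t DLo, DHi;
  if (Amt >= 32) {            // dLo = aHi >> (Amt-size); dHi becomes all 0
    DLo = Hi >> (Amt - 32);
    DHi = 0;
  } else if (Amt == 0) {
    DLo = Lo;
    DHi = Hi;
  } else {                    // dLo = (aLo >> Amt) | (aHi << (size-Amt))
    DLo = (Lo >> Amt) | (Hi << (32 - Amt));
    DHi = Hi >> Amt;
  }
  return ((uint64_t)DHi << 32) | DLo;
}

int main() {
  uint64_t X = 0x123456789ABCDEF0ULL;
  for (unsigned Amt : {1u, 17u, 32u, 45u})
    assert(srl64_parts((uint32_t)X, (uint32_t)(X >> 32), Amt) == X >> Amt);
  return 0;
}
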
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    //   dHi = shf.l.clamp aLo, aHi, Amt
+    //   dLo = aLo << Amt
+
+    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    // - if (Amt>=size) then
+    //      dLo = aLo << Amt (all 0)
+    //      dHi = aLo << (Amt-size)
+    //   else
+    //      dLo = aLo << Amt
+    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, dl, MVT::i32),
+                                   ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, dl, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, dl, MVT::i32),
+                               ISD::SETGE);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
+SDValue
+NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::RETURNADDR:
+    return SDValue();
+  case ISD::FRAMEADDR:
+    return SDValue();
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return Op;
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:
+    return Op;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::CONCAT_VECTORS:
+    return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
+  case ISD::LOAD:
+    return LowerLOAD(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  case ISD::SELECT:
+    return LowerSelect(Op, DAG);
+  default:
+    llvm_unreachable("Custom lowering not defined for operation");
+  }
+}
+
+SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Op0 = Op->getOperand(0);
+  SDValue Op1 = Op->getOperand(1);
+  SDValue Op2 = Op->getOperand(2);
+  SDLoc DL(Op.getNode());
+
+  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+
+  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
+
+  return Trunc;
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::i1)
+    return LowerLOADi1(Op, DAG);
+
+  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
+  // loads and have to handle them here.
+ if (Op.getValueType() == MVT::v2f16) { + LoadSDNode *Load = cast<LoadSDNode>(Op); + EVT MemVT = Load->getMemoryVT(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + Load->getAddressSpace(), Load->getAlignment())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, SDLoc(Op)); + } + } + + return SDValue(); +} + +// v = ld i1* addr +// => +// v1 = ld i8* addr (-> i16) +// v = trunc i16 to i1 +SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + LoadSDNode *LD = cast<LoadSDNode>(Node); + SDLoc dl(Node); + assert(LD->getExtensionType() == ISD::NON_EXTLOAD); + assert(Node->getValueType(0) == MVT::i1 && + "Custom lowering for i1 load only"); + SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), + LD->getPointerInfo(), LD->getAlignment(), + LD->getMemOperand()->getFlags()); + SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); + // The legalizer (the caller) is expecting two values from the legalized + // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() + // in LegalizeDAG.cpp which also uses MergeValues. + SDValue Ops[] = { result, LD->getChain() }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT VT = Store->getMemoryVT(); + + if (VT == MVT::i1) + return LowerSTOREi1(Op, DAG); + + // v2f16 is legal, so we can't rely on legalizer to handle unaligned + // stores and have to handle it here. + if (VT == MVT::v2f16 && + !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + Store->getAddressSpace(), Store->getAlignment())) + return expandUnalignedStore(Store, DAG); + + if (VT.isVector()) + return LowerSTOREVector(Op, DAG); + + return SDValue(); +} + +SDValue +NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { + SDNode *N = Op.getNode(); + SDValue Val = N->getOperand(1); + SDLoc DL(N); + EVT ValVT = Val.getValueType(); + + if (ValVT.isVector()) { + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 stores of <2 x double> here + // but I'm leaving that as a TODO for now. + if (!ValVT.isSimple()) + return SDValue(); + switch (ValVT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f16: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f16: + case MVT::v4f32: + case MVT::v8f16: // <4 x f16x2> + // This is a "native" vector type + break; + } + + MemSDNode *MemSD = cast<MemSDNode>(N); + const DataLayout &TD = DAG.getDataLayout(); + + unsigned Align = MemSD->getAlignment(); + unsigned PrefAlign = + TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); + if (Align < PrefAlign) { + // This store is not sufficiently aligned, so bail out and let this vector + // store be scalarized. Note that we may still be able to emit smaller + // vector stores. For example, if we are storing a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. 
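
Putting numbers on the bail-out described in the comment above, assuming the preferred alignments it implies (16 bytes for <4 x float>, 8 for <2 x float>):

#include <cassert>

int main() {
  unsigned Align = 8;        // actual alignment of the store
  unsigned PrefV4F32 = 16;   // preferred alignment of <4 x float>
  unsigned PrefV2F32 = 8;    // preferred alignment of <2 x float>
  assert(Align < PrefV4F32); // the full v4 store bails out here...
  assert(Align >= PrefV2F32); // ...but two v2 stores still succeed
  return 0;
}
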
+ return SDValue(); + } + + unsigned Opcode = 0; + EVT EltVT = ValVT.getVectorElementType(); + unsigned NumElts = ValVT.getVectorNumElements(); + + // Since StoreV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // stored type to i16 and propagate the "real" type as the memory type. + bool NeedExt = false; + if (EltVT.getSizeInBits() < 16) + NeedExt = true; + + bool StoreF16x2 = false; + switch (NumElts) { + default: + return SDValue(); + case 2: + Opcode = NVPTXISD::StoreV2; + break; + case 4: + Opcode = NVPTXISD::StoreV4; + break; + case 8: + // v8f16 is a special case. PTX doesn't have st.v8.f16 + // instruction. Instead, we split the vector into v2f16 chunks and + // store them with st.v4.b32. + assert(EltVT == MVT::f16 && "Wrong type for the vector."); + Opcode = NVPTXISD::StoreV4; + StoreF16x2 = true; + break; + } + + SmallVector<SDValue, 8> Ops; + + // First is the chain + Ops.push_back(N->getOperand(0)); + + if (StoreF16x2) { + // Combine f16,f16 -> v2f16 + NumElts /= 2; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, + DAG.getIntPtrConstant(i * 2, DL)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, + DAG.getIntPtrConstant(i * 2 + 1, DL)); + SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); + Ops.push_back(V2); + } + } else { + // Then the split values + for (unsigned i = 0; i < NumElts; ++i) { + SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(i, DL)); + if (NeedExt) + ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); + Ops.push_back(ExtVal); + } + } + + // Then any remaining arguments + Ops.append(N->op_begin() + 2, N->op_end()); + + SDValue NewSt = + DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); + + // return DCI.CombineTo(N, NewSt, true); + return NewSt; + } + + return SDValue(); +} + +// st i1 v, addr +// => +// v1 = zxt v to i16 +// st.u8 i16, addr +SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + StoreSDNode *ST = cast<StoreSDNode>(Node); + SDValue Tmp1 = ST->getChain(); + SDValue Tmp2 = ST->getBasePtr(); + SDValue Tmp3 = ST->getValue(); + assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); + Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); + SDValue Result = + DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, + ST->getAlignment(), ST->getMemOperand()->getFlags()); + return Result; +} + +SDValue +NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { + std::string ParamSym; + raw_string_ostream ParamStr(ParamSym); + + ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; + ParamStr.flush(); + + std::string *SavedStr = + nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); + return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); +} + +// Check to see if the kernel argument is image*_t or sampler_t + +static bool isImageOrSamplerVal(const Value *arg, const Module *context) { + static const char *const specialTypes[] = { "struct._image2d_t", + "struct._image3d_t", + "struct._sampler_t" }; + + Type *Ty = arg->getType(); + auto *PTy = dyn_cast<PointerType>(Ty); + + if (!PTy) + return false; + + if (!context) + return false; + + auto *STy = 
+      dyn_cast<StructType>(PTy->getElementType());
+  if (!STy || STy->isLiteral())
+    return false;
+
+  return std::find(std::begin(specialTypes), std::end(specialTypes),
+                   STy->getName()) != std::end(specialTypes);
+}
+
+SDValue NVPTXTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const DataLayout &DL = DAG.getDataLayout();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  const Function *F = &MF.getFunction();
+  const AttributeList &PAL = F->getAttributes();
+  const TargetLowering *TLI = STI.getTargetLowering();
+
+  SDValue Root = DAG.getRoot();
+  std::vector<SDValue> OutChains;
+
+  bool isABI = (STI.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+
+  std::vector<Type *> argTypes;
+  std::vector<const Argument *> theArgs;
+  for (const Argument &I : F->args()) {
+    theArgs.push_back(&I);
+    argTypes.push_back(I.getType());
+  }
+  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+  // Ins.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Ins)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Ins.
+  // So a different index should be used for indexing into Ins.
+  // See similar issue in LowerCall.
+  unsigned InsIdx = 0;
+
+  int idx = 0;
+  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+    Type *Ty = argTypes[i];
+
+    // If the kernel argument is image*_t or sampler_t, convert it to
+    // an i32 constant holding the parameter position. This can later be
+    // matched in the AsmPrinter to output the correct mangled name.
+    if (isImageOrSamplerVal(
+            theArgs[i],
+            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+                                     : nullptr))) {
+      assert(isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
+      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
+      continue;
+    }
+
+    if (theArgs[i]->use_empty()) {
+      // argument is dead
+      if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
+        SmallVector<EVT, 16> vtparts;
+
+        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(DL, Ty);
+        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+        for (unsigned parti = 0; parti < NumRegs; ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (NumRegs > 0)
+          --InsIdx;
+        continue;
+      }
+      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+      continue;
+    }
+
+    // In the following cases, assign a node order of "idx+1"
+    // to newly created nodes. The SDNodes for params have to
+    // appear in the same order as their order of appearance
+    // in the original function. "idx+1" holds that order.
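
The getParamSymbol helper above names each parameter "<function>_param_<index>", which is also how the AsmPrinter declares them. For a hypothetical kernel foo taking one pointer, the PTX would look roughly like:

.visible .entry foo(
  .param .u64 foo_param_0
)
{
  .reg .b64 %rd<2>;
  ld.param.u64 %rd1, [foo_param_0];
  ret;
}
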
+ if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { + bool aggregateIsPacked = false; + if (StructType *STy = dyn_cast<StructType>(Ty)) + aggregateIsPacked = STy->isPacked(); + + SmallVector<EVT, 16> VTs; + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); + assert(VTs.size() > 0 && "Unexpected empty type."); + auto VectorInfo = + VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); + + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + int VecIdx = -1; // Index of the first element of the current vector. + for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { + if (VectorInfo[parti] & PVF_FIRST) { + assert(VecIdx == -1 && "Orphaned vector."); + VecIdx = parti; + } + + // That's the last element of this store op. + if (VectorInfo[parti] & PVF_LAST) { + unsigned NumElts = parti - VecIdx + 1; + EVT EltVT = VTs[parti]; + // i1 is loaded/stored as i8. + EVT LoadVT = EltVT; + if (EltVT == MVT::i1) + LoadVT = MVT::i8; + else if (EltVT == MVT::v2f16) + // getLoad needs a vector type, but it can't handle + // vectors which contain v2f16 elements. So we must load + // using i32 here and then bitcast back. + LoadVT = MVT::i32; + + EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); + SDValue VecAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); + Value *srcValue = Constant::getNullValue(PointerType::get( + EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + SDValue P = + DAG.getLoad(VecVT, dl, Root, VecAddr, + MachinePointerInfo(srcValue), aggregateIsPacked, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + if (P.getNode()) + P.getNode()->setIROrder(idx + 1); + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, + DAG.getIntPtrConstant(j, dl)); + // We've loaded i1 as an i8 and now must truncate it back to i1 + if (EltVT == MVT::i1) + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); + // v2f16 was loaded as an i32. Now we must bitcast it back. + else if (EltVT == MVT::v2f16) + Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); + // Extend the element if necessary (e.g. an i8 is loaded + // into an i16 register) + if (Ins[InsIdx].VT.isInteger() && + Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { + unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); + } + InVals.push_back(Elt); + } + + // Reset vector tracking state. + VecIdx = -1; + } + ++InsIdx; + } + if (VTs.size() > 0) + --InsIdx; + continue; + } + + // Param has ByVal attribute + // Return MoveParam(param symbol). + // Ideally, the param symbol can be returned directly, + // but when SDNode builder decides to use it in a CopyToReg(), + // machine instruction fails because TargetExternalSymbol + // (not lowered) is target dependent, and CopyToReg assumes + // the source is lowered. + EVT ObjectVT = getValueType(DL, Ty); + assert(ObjectVT == Ins[InsIdx].VT && + "Ins type did not match function type"); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); + if (p.getNode()) + p.getNode()->setIROrder(idx + 1); + InVals.push_back(p); + } + + // Clang will check explicit VarArg and issue error if any. However, Clang + // will let code with + // implicit var arg like f() pass. See bug 617733. + // We treat this case as if the arg list is empty. 
+ // if (F.isVarArg()) { + // assert(0 && "VarArg not supported yet!"); + //} + + if (!OutChains.empty()) + DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); + + return Chain; +} + +SDValue +NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &dl, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + Type *RetTy = MF.getFunction().getReturnType(); + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return Chain; + + const DataLayout DL = DAG.getDataLayout(); + SmallVector<EVT, 16> VTs; + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); + + auto VectorInfo = VectorizePTXValueVTs( + VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + + SmallVector<SDValue, 6> StoreOperands; + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + + SDValue RetVal = OutVals[i]; + if (ExtendIntegerRetVal) { + RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, RetVal); + } else if (RetVal.getValueSizeInBits() < 16) { + // Use 16-bit registers for small load-stores as it's the + // smallest general purpose register size supported by NVPTX. + RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); + } + + // Record the value to return. + StoreOperands.push_back(RetVal); + + // That's the last element of this store op. + if (VectorInfo[i] & PVF_LAST) { + NVPTXISD::NodeType Op; + unsigned NumElts = StoreOperands.size() - 2; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreRetval; + break; + case 2: + Op = NVPTXISD::StoreRetvalV2; + break; + case 4: + Op = NVPTXISD::StoreRetvalV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + // Adjust type of load/store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; + Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), + StoreOperands, TheStoreType, + MachinePointerInfo(), /* Align */ 1, + MachineMemOperand::MOStore); + // Cleanup vector state. 
+ StoreOperands.clear(); + } + } + + return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); +} + +void NVPTXTargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + if (Constraint.length() > 1) + return; + else + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +static unsigned getOpcForTextureInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + return 0; + + case Intrinsic::nvvm_tex_1d_v4f32_s32: + return NVPTXISD::Tex1DFloatS32; + case Intrinsic::nvvm_tex_1d_v4f32_f32: + return NVPTXISD::Tex1DFloatFloat; + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_v4s32_s32: + return NVPTXISD::Tex1DS32S32; + case Intrinsic::nvvm_tex_1d_v4s32_f32: + return NVPTXISD::Tex1DS32Float; + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + return NVPTXISD::Tex1DS32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + return NVPTXISD::Tex1DS32FloatGrad; + case Intrinsic::nvvm_tex_1d_v4u32_s32: + return NVPTXISD::Tex1DU32S32; + case Intrinsic::nvvm_tex_1d_v4u32_f32: + return NVPTXISD::Tex1DU32Float; + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + return NVPTXISD::Tex1DU32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + return NVPTXISD::Tex1DU32FloatGrad; + + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + return NVPTXISD::Tex1DArrayFloatS32; + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloat; + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + return NVPTXISD::Tex1DArrayS32S32; + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + return NVPTXISD::Tex1DArrayS32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + return NVPTXISD::Tex1DArrayU32S32; + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + return NVPTXISD::Tex1DArrayU32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_v4f32_s32: + return NVPTXISD::Tex2DFloatS32; + case Intrinsic::nvvm_tex_2d_v4f32_f32: + return NVPTXISD::Tex2DFloatFloat; + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_v4s32_s32: + return NVPTXISD::Tex2DS32S32; + case Intrinsic::nvvm_tex_2d_v4s32_f32: + return NVPTXISD::Tex2DS32Float; + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + return NVPTXISD::Tex2DS32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + return NVPTXISD::Tex2DS32FloatGrad; + case Intrinsic::nvvm_tex_2d_v4u32_s32: + return NVPTXISD::Tex2DU32S32; + case Intrinsic::nvvm_tex_2d_v4u32_f32: + return NVPTXISD::Tex2DU32Float; + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + return NVPTXISD::Tex2DU32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + return NVPTXISD::Tex2DU32FloatGrad; + + case 
Intrinsic::nvvm_tex_2d_array_v4f32_s32: + return NVPTXISD::Tex2DArrayFloatS32; + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloat; + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + return NVPTXISD::Tex2DArrayS32S32; + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + return NVPTXISD::Tex2DArrayS32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + return NVPTXISD::Tex2DArrayU32S32; + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + return NVPTXISD::Tex2DArrayU32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_3d_v4f32_s32: + return NVPTXISD::Tex3DFloatS32; + case Intrinsic::nvvm_tex_3d_v4f32_f32: + return NVPTXISD::Tex3DFloatFloat; + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatGrad; + case Intrinsic::nvvm_tex_3d_v4s32_s32: + return NVPTXISD::Tex3DS32S32; + case Intrinsic::nvvm_tex_3d_v4s32_f32: + return NVPTXISD::Tex3DS32Float; + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + return NVPTXISD::Tex3DS32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + return NVPTXISD::Tex3DS32FloatGrad; + case Intrinsic::nvvm_tex_3d_v4u32_s32: + return NVPTXISD::Tex3DU32S32; + case Intrinsic::nvvm_tex_3d_v4u32_f32: + return NVPTXISD::Tex3DU32Float; + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + return NVPTXISD::Tex3DU32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + return NVPTXISD::Tex3DU32FloatGrad; + + case Intrinsic::nvvm_tex_cube_v4f32_f32: + return NVPTXISD::TexCubeFloatFloat; + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + return NVPTXISD::TexCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_v4s32_f32: + return NVPTXISD::TexCubeS32Float; + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + return NVPTXISD::TexCubeS32FloatLevel; + case Intrinsic::nvvm_tex_cube_v4u32_f32: + return NVPTXISD::TexCubeU32Float; + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + return NVPTXISD::TexCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + return NVPTXISD::TexCubeArrayS32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + return NVPTXISD::TexCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + return NVPTXISD::TexCubeArrayU32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + return NVPTXISD::TexCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + return NVPTXISD::Tld4R2DFloatFloat; + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + return NVPTXISD::Tld4G2DFloatFloat; + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + return NVPTXISD::Tld4B2DFloatFloat; + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + return NVPTXISD::Tld4A2DFloatFloat; + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + return NVPTXISD::Tld4R2DS64Float; + 
case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + return NVPTXISD::Tld4G2DS64Float; + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + return NVPTXISD::Tld4B2DS64Float; + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + return NVPTXISD::Tld4A2DS64Float; + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + return NVPTXISD::Tld4R2DU64Float; + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + return NVPTXISD::Tld4G2DU64Float; + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + return NVPTXISD::Tld4B2DU64Float; + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + return NVPTXISD::Tld4A2DU64Float; + + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + return NVPTXISD::TexUnified1DFloatS32; + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + return NVPTXISD::TexUnified1DS32S32; + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + return NVPTXISD::TexUnified1DS32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + return NVPTXISD::TexUnified1DU32S32; + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + return NVPTXISD::TexUnified1DU32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + return NVPTXISD::TexUnified1DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + return NVPTXISD::TexUnified1DArrayS32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + return NVPTXISD::TexUnified1DArrayU32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + return NVPTXISD::TexUnified2DFloatS32; + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + return NVPTXISD::TexUnified2DS32S32; + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + return NVPTXISD::TexUnified2DS32Float; + case 
Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + return NVPTXISD::TexUnified2DU32S32; + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + return NVPTXISD::TexUnified2DU32Float; + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + return NVPTXISD::TexUnified2DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + return NVPTXISD::TexUnified2DArrayS32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + return NVPTXISD::TexUnified2DArrayU32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + return NVPTXISD::TexUnified3DFloatS32; + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloat; + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + return NVPTXISD::TexUnified3DS32S32; + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + return NVPTXISD::TexUnified3DS32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + return NVPTXISD::TexUnified3DU32S32; + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + return NVPTXISD::TexUnified3DU32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + return 
NVPTXISD::TexUnifiedCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedR2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedG2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedB2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedA2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedR2DS64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedG2DS64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedB2DS64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedA2DS64Float; + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedR2DU64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedG2DU64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedB2DU64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedA2DU64Float; + } +} + +static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + return 0; + case Intrinsic::nvvm_suld_1d_i8_clamp: + return NVPTXISD::Suld1DI8Clamp; + case Intrinsic::nvvm_suld_1d_i16_clamp: + return NVPTXISD::Suld1DI16Clamp; + case Intrinsic::nvvm_suld_1d_i32_clamp: + return NVPTXISD::Suld1DI32Clamp; + case Intrinsic::nvvm_suld_1d_i64_clamp: + return NVPTXISD::Suld1DI64Clamp; + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + return NVPTXISD::Suld1DV2I8Clamp; + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + return NVPTXISD::Suld1DV2I16Clamp; + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + return NVPTXISD::Suld1DV2I32Clamp; + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + return NVPTXISD::Suld1DV2I64Clamp; + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + return NVPTXISD::Suld1DV4I8Clamp; + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + return NVPTXISD::Suld1DV4I16Clamp; + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + return NVPTXISD::Suld1DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + return NVPTXISD::Suld1DArrayI8Clamp; + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + return NVPTXISD::Suld1DArrayI16Clamp; + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + return NVPTXISD::Suld1DArrayI32Clamp; + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + return NVPTXISD::Suld1DArrayI64Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + return NVPTXISD::Suld1DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + return NVPTXISD::Suld1DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + return NVPTXISD::Suld1DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + return 
NVPTXISD::Suld1DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + return NVPTXISD::Suld1DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + return NVPTXISD::Suld1DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + return NVPTXISD::Suld1DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_2d_i8_clamp: + return NVPTXISD::Suld2DI8Clamp; + case Intrinsic::nvvm_suld_2d_i16_clamp: + return NVPTXISD::Suld2DI16Clamp; + case Intrinsic::nvvm_suld_2d_i32_clamp: + return NVPTXISD::Suld2DI32Clamp; + case Intrinsic::nvvm_suld_2d_i64_clamp: + return NVPTXISD::Suld2DI64Clamp; + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + return NVPTXISD::Suld2DV2I8Clamp; + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + return NVPTXISD::Suld2DV2I16Clamp; + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + return NVPTXISD::Suld2DV2I32Clamp; + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + return NVPTXISD::Suld2DV2I64Clamp; + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + return NVPTXISD::Suld2DV4I8Clamp; + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + return NVPTXISD::Suld2DV4I16Clamp; + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + return NVPTXISD::Suld2DV4I32Clamp; + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + return NVPTXISD::Suld2DArrayI8Clamp; + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + return NVPTXISD::Suld2DArrayI16Clamp; + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + return NVPTXISD::Suld2DArrayI32Clamp; + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + return NVPTXISD::Suld2DArrayI64Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + return NVPTXISD::Suld2DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + return NVPTXISD::Suld2DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + return NVPTXISD::Suld2DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + return NVPTXISD::Suld2DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + return NVPTXISD::Suld2DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + return NVPTXISD::Suld2DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + return NVPTXISD::Suld2DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_3d_i8_clamp: + return NVPTXISD::Suld3DI8Clamp; + case Intrinsic::nvvm_suld_3d_i16_clamp: + return NVPTXISD::Suld3DI16Clamp; + case Intrinsic::nvvm_suld_3d_i32_clamp: + return NVPTXISD::Suld3DI32Clamp; + case Intrinsic::nvvm_suld_3d_i64_clamp: + return NVPTXISD::Suld3DI64Clamp; + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + return NVPTXISD::Suld3DV2I8Clamp; + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + return NVPTXISD::Suld3DV2I16Clamp; + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + return NVPTXISD::Suld3DV2I32Clamp; + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + return NVPTXISD::Suld3DV2I64Clamp; + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + return NVPTXISD::Suld3DV4I8Clamp; + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + return NVPTXISD::Suld3DV4I16Clamp; + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + return NVPTXISD::Suld3DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_i8_trap: + return NVPTXISD::Suld1DI8Trap; + case Intrinsic::nvvm_suld_1d_i16_trap: + return NVPTXISD::Suld1DI16Trap; + case Intrinsic::nvvm_suld_1d_i32_trap: + return NVPTXISD::Suld1DI32Trap; + case Intrinsic::nvvm_suld_1d_i64_trap: + return NVPTXISD::Suld1DI64Trap; + case Intrinsic::nvvm_suld_1d_v2i8_trap: + return NVPTXISD::Suld1DV2I8Trap; + case Intrinsic::nvvm_suld_1d_v2i16_trap: + return NVPTXISD::Suld1DV2I16Trap; + case Intrinsic::nvvm_suld_1d_v2i32_trap: + return 
NVPTXISD::Suld1DV2I32Trap; + case Intrinsic::nvvm_suld_1d_v2i64_trap: + return NVPTXISD::Suld1DV2I64Trap; + case Intrinsic::nvvm_suld_1d_v4i8_trap: + return NVPTXISD::Suld1DV4I8Trap; + case Intrinsic::nvvm_suld_1d_v4i16_trap: + return NVPTXISD::Suld1DV4I16Trap; + case Intrinsic::nvvm_suld_1d_v4i32_trap: + return NVPTXISD::Suld1DV4I32Trap; + case Intrinsic::nvvm_suld_1d_array_i8_trap: + return NVPTXISD::Suld1DArrayI8Trap; + case Intrinsic::nvvm_suld_1d_array_i16_trap: + return NVPTXISD::Suld1DArrayI16Trap; + case Intrinsic::nvvm_suld_1d_array_i32_trap: + return NVPTXISD::Suld1DArrayI32Trap; + case Intrinsic::nvvm_suld_1d_array_i64_trap: + return NVPTXISD::Suld1DArrayI64Trap; + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + return NVPTXISD::Suld1DArrayV2I8Trap; + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + return NVPTXISD::Suld1DArrayV2I16Trap; + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + return NVPTXISD::Suld1DArrayV2I32Trap; + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + return NVPTXISD::Suld1DArrayV2I64Trap; + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + return NVPTXISD::Suld1DArrayV4I8Trap; + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + return NVPTXISD::Suld1DArrayV4I16Trap; + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + return NVPTXISD::Suld1DArrayV4I32Trap; + case Intrinsic::nvvm_suld_2d_i8_trap: + return NVPTXISD::Suld2DI8Trap; + case Intrinsic::nvvm_suld_2d_i16_trap: + return NVPTXISD::Suld2DI16Trap; + case Intrinsic::nvvm_suld_2d_i32_trap: + return NVPTXISD::Suld2DI32Trap; + case Intrinsic::nvvm_suld_2d_i64_trap: + return NVPTXISD::Suld2DI64Trap; + case Intrinsic::nvvm_suld_2d_v2i8_trap: + return NVPTXISD::Suld2DV2I8Trap; + case Intrinsic::nvvm_suld_2d_v2i16_trap: + return NVPTXISD::Suld2DV2I16Trap; + case Intrinsic::nvvm_suld_2d_v2i32_trap: + return NVPTXISD::Suld2DV2I32Trap; + case Intrinsic::nvvm_suld_2d_v2i64_trap: + return NVPTXISD::Suld2DV2I64Trap; + case Intrinsic::nvvm_suld_2d_v4i8_trap: + return NVPTXISD::Suld2DV4I8Trap; + case Intrinsic::nvvm_suld_2d_v4i16_trap: + return NVPTXISD::Suld2DV4I16Trap; + case Intrinsic::nvvm_suld_2d_v4i32_trap: + return NVPTXISD::Suld2DV4I32Trap; + case Intrinsic::nvvm_suld_2d_array_i8_trap: + return NVPTXISD::Suld2DArrayI8Trap; + case Intrinsic::nvvm_suld_2d_array_i16_trap: + return NVPTXISD::Suld2DArrayI16Trap; + case Intrinsic::nvvm_suld_2d_array_i32_trap: + return NVPTXISD::Suld2DArrayI32Trap; + case Intrinsic::nvvm_suld_2d_array_i64_trap: + return NVPTXISD::Suld2DArrayI64Trap; + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + return NVPTXISD::Suld2DArrayV2I8Trap; + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + return NVPTXISD::Suld2DArrayV2I16Trap; + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + return NVPTXISD::Suld2DArrayV2I32Trap; + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + return NVPTXISD::Suld2DArrayV2I64Trap; + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + return NVPTXISD::Suld2DArrayV4I8Trap; + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + return NVPTXISD::Suld2DArrayV4I16Trap; + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + return NVPTXISD::Suld2DArrayV4I32Trap; + case Intrinsic::nvvm_suld_3d_i8_trap: + return NVPTXISD::Suld3DI8Trap; + case Intrinsic::nvvm_suld_3d_i16_trap: + return NVPTXISD::Suld3DI16Trap; + case Intrinsic::nvvm_suld_3d_i32_trap: + return NVPTXISD::Suld3DI32Trap; + case Intrinsic::nvvm_suld_3d_i64_trap: + return NVPTXISD::Suld3DI64Trap; + case Intrinsic::nvvm_suld_3d_v2i8_trap: + return NVPTXISD::Suld3DV2I8Trap; + case Intrinsic::nvvm_suld_3d_v2i16_trap: + return 
NVPTXISD::Suld3DV2I16Trap; + case Intrinsic::nvvm_suld_3d_v2i32_trap: + return NVPTXISD::Suld3DV2I32Trap; + case Intrinsic::nvvm_suld_3d_v2i64_trap: + return NVPTXISD::Suld3DV2I64Trap; + case Intrinsic::nvvm_suld_3d_v4i8_trap: + return NVPTXISD::Suld3DV4I8Trap; + case Intrinsic::nvvm_suld_3d_v4i16_trap: + return NVPTXISD::Suld3DV4I16Trap; + case Intrinsic::nvvm_suld_3d_v4i32_trap: + return NVPTXISD::Suld3DV4I32Trap; + case Intrinsic::nvvm_suld_1d_i8_zero: + return NVPTXISD::Suld1DI8Zero; + case Intrinsic::nvvm_suld_1d_i16_zero: + return NVPTXISD::Suld1DI16Zero; + case Intrinsic::nvvm_suld_1d_i32_zero: + return NVPTXISD::Suld1DI32Zero; + case Intrinsic::nvvm_suld_1d_i64_zero: + return NVPTXISD::Suld1DI64Zero; + case Intrinsic::nvvm_suld_1d_v2i8_zero: + return NVPTXISD::Suld1DV2I8Zero; + case Intrinsic::nvvm_suld_1d_v2i16_zero: + return NVPTXISD::Suld1DV2I16Zero; + case Intrinsic::nvvm_suld_1d_v2i32_zero: + return NVPTXISD::Suld1DV2I32Zero; + case Intrinsic::nvvm_suld_1d_v2i64_zero: + return NVPTXISD::Suld1DV2I64Zero; + case Intrinsic::nvvm_suld_1d_v4i8_zero: + return NVPTXISD::Suld1DV4I8Zero; + case Intrinsic::nvvm_suld_1d_v4i16_zero: + return NVPTXISD::Suld1DV4I16Zero; + case Intrinsic::nvvm_suld_1d_v4i32_zero: + return NVPTXISD::Suld1DV4I32Zero; + case Intrinsic::nvvm_suld_1d_array_i8_zero: + return NVPTXISD::Suld1DArrayI8Zero; + case Intrinsic::nvvm_suld_1d_array_i16_zero: + return NVPTXISD::Suld1DArrayI16Zero; + case Intrinsic::nvvm_suld_1d_array_i32_zero: + return NVPTXISD::Suld1DArrayI32Zero; + case Intrinsic::nvvm_suld_1d_array_i64_zero: + return NVPTXISD::Suld1DArrayI64Zero; + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + return NVPTXISD::Suld1DArrayV2I8Zero; + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + return NVPTXISD::Suld1DArrayV2I16Zero; + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + return NVPTXISD::Suld1DArrayV2I32Zero; + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + return NVPTXISD::Suld1DArrayV2I64Zero; + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + return NVPTXISD::Suld1DArrayV4I8Zero; + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + return NVPTXISD::Suld1DArrayV4I16Zero; + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + return NVPTXISD::Suld1DArrayV4I32Zero; + case Intrinsic::nvvm_suld_2d_i8_zero: + return NVPTXISD::Suld2DI8Zero; + case Intrinsic::nvvm_suld_2d_i16_zero: + return NVPTXISD::Suld2DI16Zero; + case Intrinsic::nvvm_suld_2d_i32_zero: + return NVPTXISD::Suld2DI32Zero; + case Intrinsic::nvvm_suld_2d_i64_zero: + return NVPTXISD::Suld2DI64Zero; + case Intrinsic::nvvm_suld_2d_v2i8_zero: + return NVPTXISD::Suld2DV2I8Zero; + case Intrinsic::nvvm_suld_2d_v2i16_zero: + return NVPTXISD::Suld2DV2I16Zero; + case Intrinsic::nvvm_suld_2d_v2i32_zero: + return NVPTXISD::Suld2DV2I32Zero; + case Intrinsic::nvvm_suld_2d_v2i64_zero: + return NVPTXISD::Suld2DV2I64Zero; + case Intrinsic::nvvm_suld_2d_v4i8_zero: + return NVPTXISD::Suld2DV4I8Zero; + case Intrinsic::nvvm_suld_2d_v4i16_zero: + return NVPTXISD::Suld2DV4I16Zero; + case Intrinsic::nvvm_suld_2d_v4i32_zero: + return NVPTXISD::Suld2DV4I32Zero; + case Intrinsic::nvvm_suld_2d_array_i8_zero: + return NVPTXISD::Suld2DArrayI8Zero; + case Intrinsic::nvvm_suld_2d_array_i16_zero: + return NVPTXISD::Suld2DArrayI16Zero; + case Intrinsic::nvvm_suld_2d_array_i32_zero: + return NVPTXISD::Suld2DArrayI32Zero; + case Intrinsic::nvvm_suld_2d_array_i64_zero: + return NVPTXISD::Suld2DArrayI64Zero; + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + return NVPTXISD::Suld2DArrayV2I8Zero; + case 
Intrinsic::nvvm_suld_2d_array_v2i16_zero: + return NVPTXISD::Suld2DArrayV2I16Zero; + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + return NVPTXISD::Suld2DArrayV2I32Zero; + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + return NVPTXISD::Suld2DArrayV2I64Zero; + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + return NVPTXISD::Suld2DArrayV4I8Zero; + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + return NVPTXISD::Suld2DArrayV4I16Zero; + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + return NVPTXISD::Suld2DArrayV4I32Zero; + case Intrinsic::nvvm_suld_3d_i8_zero: + return NVPTXISD::Suld3DI8Zero; + case Intrinsic::nvvm_suld_3d_i16_zero: + return NVPTXISD::Suld3DI16Zero; + case Intrinsic::nvvm_suld_3d_i32_zero: + return NVPTXISD::Suld3DI32Zero; + case Intrinsic::nvvm_suld_3d_i64_zero: + return NVPTXISD::Suld3DI64Zero; + case Intrinsic::nvvm_suld_3d_v2i8_zero: + return NVPTXISD::Suld3DV2I8Zero; + case Intrinsic::nvvm_suld_3d_v2i16_zero: + return NVPTXISD::Suld3DV2I16Zero; + case Intrinsic::nvvm_suld_3d_v2i32_zero: + return NVPTXISD::Suld3DV2I32Zero; + case Intrinsic::nvvm_suld_3d_v2i64_zero: + return NVPTXISD::Suld3DV2I64Zero; + case Intrinsic::nvvm_suld_3d_v4i8_zero: + return NVPTXISD::Suld3DV4I8Zero; + case Intrinsic::nvvm_suld_3d_v4i16_zero: + return NVPTXISD::Suld3DV4I16Zero; + case Intrinsic::nvvm_suld_3d_v4i32_zero: + return NVPTXISD::Suld3DV4I32Zero; + } +} + +// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as +// TgtMemIntrinsic because we need information that is only available in the +// "Value" type of the destination pointer. In particular, the address space +// information. +bool NVPTXTargetLowering::getTgtMemIntrinsic( + IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const { + switch (Intrinsic) { + default: + return false; + case Intrinsic::nvvm_match_all_sync_i32p: + case Intrinsic::nvvm_match_all_sync_i64p: + Info.opc = ISD::INTRINSIC_W_CHAIN; + // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly + // attribute in order to model data exchange with other threads, but they + // perform no real memory accesses. + Info.memVT = MVT::i1; + + // Our result depends on both our and other threads' arguments.
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + case Intrinsic::nvvm_wmma_load_a_f16_col: + case Intrinsic::nvvm_wmma_load_a_f16_row: + case Intrinsic::nvvm_wmma_load_a_f16_col_stride: + case Intrinsic::nvvm_wmma_load_a_f16_row_stride: + case Intrinsic::nvvm_wmma_load_a_f16_col_shared: + case Intrinsic::nvvm_wmma_load_a_f16_row_shared: + case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride: + case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride: + case Intrinsic::nvvm_wmma_load_a_f16_col_global: + case Intrinsic::nvvm_wmma_load_a_f16_row_global: + case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride: + case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride: + case Intrinsic::nvvm_wmma_load_b_f16_col: + case Intrinsic::nvvm_wmma_load_b_f16_row: + case Intrinsic::nvvm_wmma_load_b_f16_col_stride: + case Intrinsic::nvvm_wmma_load_b_f16_row_stride: + case Intrinsic::nvvm_wmma_load_b_f16_col_shared: + case Intrinsic::nvvm_wmma_load_b_f16_row_shared: + case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride: + case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride: + case Intrinsic::nvvm_wmma_load_b_f16_col_global: + case Intrinsic::nvvm_wmma_load_b_f16_row_global: + case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride: + case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v8f16; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_load_c_f16_col: + case Intrinsic::nvvm_wmma_load_c_f16_row: + case Intrinsic::nvvm_wmma_load_c_f16_col_stride: + case Intrinsic::nvvm_wmma_load_c_f16_row_stride: + case Intrinsic::nvvm_wmma_load_c_f16_col_shared: + case Intrinsic::nvvm_wmma_load_c_f16_row_shared: + case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride: + case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride: + case Intrinsic::nvvm_wmma_load_c_f16_col_global: + case Intrinsic::nvvm_wmma_load_c_f16_row_global: + case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride: + case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v4f16; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_load_c_f32_col: + case Intrinsic::nvvm_wmma_load_c_f32_row: + case Intrinsic::nvvm_wmma_load_c_f32_col_stride: + case Intrinsic::nvvm_wmma_load_c_f32_row_stride: + case Intrinsic::nvvm_wmma_load_c_f32_col_shared: + case Intrinsic::nvvm_wmma_load_c_f32_row_shared: + case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride: + case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride: + case Intrinsic::nvvm_wmma_load_c_f32_col_global: + case Intrinsic::nvvm_wmma_load_c_f32_row_global: + case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride: + case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v8f32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_store_d_f16_col: + case Intrinsic::nvvm_wmma_store_d_f16_row: + case Intrinsic::nvvm_wmma_store_d_f16_col_stride: + case Intrinsic::nvvm_wmma_store_d_f16_row_stride: + case Intrinsic::nvvm_wmma_store_d_f16_col_shared: + case Intrinsic::nvvm_wmma_store_d_f16_row_shared: + case 
Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride: + case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride: + case Intrinsic::nvvm_wmma_store_d_f16_col_global: + case Intrinsic::nvvm_wmma_store_d_f16_row_global: + case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride: + case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v4f16; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOStore; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_wmma_store_d_f32_col: + case Intrinsic::nvvm_wmma_store_d_f32_row: + case Intrinsic::nvvm_wmma_store_d_f32_col_stride: + case Intrinsic::nvvm_wmma_store_d_f32_row_stride: + case Intrinsic::nvvm_wmma_store_d_f32_col_shared: + case Intrinsic::nvvm_wmma_store_d_f32_row_shared: + case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride: + case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride: + case Intrinsic::nvvm_wmma_store_d_f32_col_global: + case Intrinsic::nvvm_wmma_store_d_f32_row_global: + case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride: + case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::v8f32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOStore; + Info.align = 16; + return true; + } + + case Intrinsic::nvvm_atomic_load_add_f32: + case Intrinsic::nvvm_atomic_load_add_f64: + case Intrinsic::nvvm_atomic_load_inc_32: + case Intrinsic::nvvm_atomic_load_dec_32: + + case Intrinsic::nvvm_atomic_add_gen_f_cta: + case Intrinsic::nvvm_atomic_add_gen_f_sys: + case Intrinsic::nvvm_atomic_add_gen_i_cta: + case Intrinsic::nvvm_atomic_add_gen_i_sys: + case Intrinsic::nvvm_atomic_and_gen_i_cta: + case Intrinsic::nvvm_atomic_and_gen_i_sys: + case Intrinsic::nvvm_atomic_cas_gen_i_cta: + case Intrinsic::nvvm_atomic_cas_gen_i_sys: + case Intrinsic::nvvm_atomic_dec_gen_i_cta: + case Intrinsic::nvvm_atomic_dec_gen_i_sys: + case Intrinsic::nvvm_atomic_inc_gen_i_cta: + case Intrinsic::nvvm_atomic_inc_gen_i_sys: + case Intrinsic::nvvm_atomic_max_gen_i_cta: + case Intrinsic::nvvm_atomic_max_gen_i_sys: + case Intrinsic::nvvm_atomic_min_gen_i_cta: + case Intrinsic::nvvm_atomic_min_gen_i_sys: + case Intrinsic::nvvm_atomic_or_gen_i_cta: + case Intrinsic::nvvm_atomic_or_gen_i_sys: + case Intrinsic::nvvm_atomic_exch_gen_i_cta: + case Intrinsic::nvvm_atomic_exch_gen_i_sys: + case Intrinsic::nvvm_atomic_xor_gen_i_cta: + case Intrinsic::nvvm_atomic_xor_gen_i_sys: { + auto &DL = I.getModule()->getDataLayout(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = getValueType(DL, I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.align = 0; + return true; + } + + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: { + auto &DL = I.getModule()->getDataLayout(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldu_global_i) + Info.memVT = getValueType(DL, I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(DL); + else + Info.memVT = getValueType(DL, I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + + return true; + } + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case 
Intrinsic::nvvm_ldg_global_p: { + auto &DL = I.getModule()->getDataLayout(); + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldg_global_i) + Info.memVT = getValueType(DL, I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) + Info.memVT = getPointerTy(DL); + else + Info.memVT = getValueType(DL, I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + + return true; + } + + case Intrinsic::nvvm_tex_1d_v4f32_s32: + case Intrinsic::nvvm_tex_1d_v4f32_f32: + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_v4f32_s32: + case Intrinsic::nvvm_tex_2d_v4f32_f32: + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_3d_v4f32_s32: + case Intrinsic::nvvm_tex_3d_v4f32_f32: + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_cube_v4f32_f32: + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::v4f32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + + 
case Intrinsic::nvvm_tex_1d_v4s32_s32: + case Intrinsic::nvvm_tex_1d_v4s32_f32: + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_v4s32_s32: + case Intrinsic::nvvm_tex_2d_v4s32_f32: + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_3d_v4s32_s32: + case Intrinsic::nvvm_tex_3d_v4s32_f32: + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4s32_f32: + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4u32_f32: + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_v4u32_s32: + case Intrinsic::nvvm_tex_1d_v4u32_f32: + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_v4u32_s32: + case Intrinsic::nvvm_tex_2d_v4u32_f32: + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_3d_v4u32_s32: + case Intrinsic::nvvm_tex_3d_v4u32_f32: + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + case 
Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::v4i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i8_clamp: + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_i8_clamp: + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_3d_i8_clamp: + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + case Intrinsic::nvvm_suld_1d_i8_trap: + case Intrinsic::nvvm_suld_1d_v2i8_trap: + case Intrinsic::nvvm_suld_1d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_array_i8_trap: + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + case Intrinsic::nvvm_suld_2d_i8_trap: + case Intrinsic::nvvm_suld_2d_v2i8_trap: + case Intrinsic::nvvm_suld_2d_v4i8_trap: + case Intrinsic::nvvm_suld_2d_array_i8_trap: + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + case Intrinsic::nvvm_suld_3d_i8_trap: + case 
Intrinsic::nvvm_suld_3d_v2i8_trap: + case Intrinsic::nvvm_suld_3d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_i8_zero: + case Intrinsic::nvvm_suld_1d_v2i8_zero: + case Intrinsic::nvvm_suld_1d_v4i8_zero: + case Intrinsic::nvvm_suld_1d_array_i8_zero: + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + case Intrinsic::nvvm_suld_2d_i8_zero: + case Intrinsic::nvvm_suld_2d_v2i8_zero: + case Intrinsic::nvvm_suld_2d_v4i8_zero: + case Intrinsic::nvvm_suld_2d_array_i8_zero: + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + case Intrinsic::nvvm_suld_3d_i8_zero: + case Intrinsic::nvvm_suld_3d_v2i8_zero: + case Intrinsic::nvvm_suld_3d_v4i8_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i8; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i16_clamp: + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_i16_clamp: + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_3d_i16_clamp: + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_i16_trap: + case Intrinsic::nvvm_suld_1d_v2i16_trap: + case Intrinsic::nvvm_suld_1d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_array_i16_trap: + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + case Intrinsic::nvvm_suld_2d_i16_trap: + case Intrinsic::nvvm_suld_2d_v2i16_trap: + case Intrinsic::nvvm_suld_2d_v4i16_trap: + case Intrinsic::nvvm_suld_2d_array_i16_trap: + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + case Intrinsic::nvvm_suld_3d_i16_trap: + case Intrinsic::nvvm_suld_3d_v2i16_trap: + case Intrinsic::nvvm_suld_3d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_i16_zero: + case Intrinsic::nvvm_suld_1d_v2i16_zero: + case Intrinsic::nvvm_suld_1d_v4i16_zero: + case Intrinsic::nvvm_suld_1d_array_i16_zero: + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + case Intrinsic::nvvm_suld_2d_i16_zero: + case Intrinsic::nvvm_suld_2d_v2i16_zero: + case Intrinsic::nvvm_suld_2d_v4i16_zero: + case Intrinsic::nvvm_suld_2d_array_i16_zero: + case Intrinsic::nvvm_suld_2d_array_v2i16_zero: + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + case Intrinsic::nvvm_suld_3d_i16_zero: + case Intrinsic::nvvm_suld_3d_v2i16_zero: + case Intrinsic::nvvm_suld_3d_v4i16_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i16; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i32_clamp: + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_i32_clamp: + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + case 
Intrinsic::nvvm_suld_2d_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_3d_i32_clamp: + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_i32_trap: + case Intrinsic::nvvm_suld_1d_v2i32_trap: + case Intrinsic::nvvm_suld_1d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_array_i32_trap: + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + case Intrinsic::nvvm_suld_2d_i32_trap: + case Intrinsic::nvvm_suld_2d_v2i32_trap: + case Intrinsic::nvvm_suld_2d_v4i32_trap: + case Intrinsic::nvvm_suld_2d_array_i32_trap: + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + case Intrinsic::nvvm_suld_3d_i32_trap: + case Intrinsic::nvvm_suld_3d_v2i32_trap: + case Intrinsic::nvvm_suld_3d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_i32_zero: + case Intrinsic::nvvm_suld_1d_v2i32_zero: + case Intrinsic::nvvm_suld_1d_v4i32_zero: + case Intrinsic::nvvm_suld_1d_array_i32_zero: + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + case Intrinsic::nvvm_suld_2d_i32_zero: + case Intrinsic::nvvm_suld_2d_v2i32_zero: + case Intrinsic::nvvm_suld_2d_v4i32_zero: + case Intrinsic::nvvm_suld_2d_array_i32_zero: + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + case Intrinsic::nvvm_suld_3d_i32_zero: + case Intrinsic::nvvm_suld_3d_v2i32_zero: + case Intrinsic::nvvm_suld_3d_v4i32_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i64_clamp: + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_i64_clamp: + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_3d_i64_clamp: + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_i64_trap: + case Intrinsic::nvvm_suld_1d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_array_i64_trap: + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + case Intrinsic::nvvm_suld_2d_i64_trap: + case Intrinsic::nvvm_suld_2d_v2i64_trap: + case Intrinsic::nvvm_suld_2d_array_i64_trap: + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + case Intrinsic::nvvm_suld_3d_i64_trap: + case Intrinsic::nvvm_suld_3d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_i64_zero: + case Intrinsic::nvvm_suld_1d_v2i64_zero: + case Intrinsic::nvvm_suld_1d_array_i64_zero: + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + case Intrinsic::nvvm_suld_2d_i64_zero: + case Intrinsic::nvvm_suld_2d_v2i64_zero: + case Intrinsic::nvvm_suld_2d_array_i64_zero: + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + case Intrinsic::nvvm_suld_3d_i64_zero: + case Intrinsic::nvvm_suld_3d_v2i64_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i64; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad; + Info.align = 16; + return true; + } + return false; +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the 
specified type. +/// Used to guide target specific optimizations, like loop strength reduction +/// (LoopStrengthReduce.cpp) and memory optimization for address mode +/// (CodeGenPrepare.cpp) +bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS, Instruction *I) const { + // AddrMode - This represents an addressing mode of: + // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + // + // The legal address modes are + // - [avar] + // - [areg] + // - [areg+immoff] + // - [immAddr] + + if (AM.BaseGV) { + return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; + } + + switch (AM.Scale) { + case 0: // "r", "r+i" or "i" is allowed + break; + case 1: + if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. + return false; + // Otherwise we have r+i. + break; + default: + // No scale > 1 is allowed + return false; + } + return true; +} + +//===----------------------------------------------------------------------===// +// NVPTX Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +NVPTXTargetLowering::ConstraintType +NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'b': + case 'r': + case 'h': + case 'c': + case 'l': + case 'f': + case 'd': + case '0': + case 'N': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +std::pair<unsigned, const TargetRegisterClass *> +NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'b': + return std::make_pair(0U, &NVPTX::Int1RegsRegClass); + case 'c': + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); + case 'h': + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); + case 'r': + return std::make_pair(0U, &NVPTX::Int32RegsRegClass); + case 'l': + case 'N': + return std::make_pair(0U, &NVPTX::Int64RegsRegClass); + case 'f': + return std::make_pair(0U, &NVPTX::Float32RegsRegClass); + case 'd': + return std::make_pair(0U, &NVPTX::Float64RegsRegClass); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +//===----------------------------------------------------------------------===// +// NVPTX DAG Combining +//===----------------------------------------------------------------------===// + +bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, + CodeGenOpt::Level OptLevel) const { + // Always honor command-line argument + if (FMAContractLevelOpt.getNumOccurrences() > 0) + return FMAContractLevelOpt > 0; + + // Do not contract if we're not optimizing the code. + if (OptLevel == 0) + return false; + + // Honor TargetOptions flags that explicitly say fusion is okay. + if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) + return true; + + return allowUnsafeFPMath(MF); +} + +bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { + // Honor TargetOptions flags that explicitly say unsafe math is okay. + if (MF.getTarget().Options.UnsafeFPMath) + return true; + + // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 
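+ // (Illustrative note, an assumption about frontend behavior rather than part
+ // of this change: frontends such as clang attach the string attribute
+ // "unsafe-fp-math"="true" to functions built with -funsafe-math-optimizations
+ // or -ffast-math, and that attribute is what the lookup below observes.)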
+ const Function &F = MF.getFunction(); + if (F.hasFnAttribute("unsafe-fp-math")) { + Attribute Attr = F.getFnAttribute("unsafe-fp-math"); + StringRef Val = Attr.getValueAsString(); + if (Val == "true") + return true; + } + + return false; +} + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SelectionDAG &DAG = DCI.DAG; + // Skip the non-integer, non-scalar case. + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + // fold (add (mul a, b), c) -> (mad a, b, c) + // + if (N0.getOpcode() == ISD::MUL) { + assert(VT.isInteger()); + // For integer: + // Since integer multiply-add costs the same as integer multiply + // but is more costly than integer add, do the fusion only when + // the mul is only used in the add. + if (OptLevel == CodeGenOpt::None || VT != MVT::i32 || + !N0.getNode()->hasOneUse()) + return SDValue(); + + // Do the folding. + return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + else if (N0.getOpcode() == ISD::FMUL) { + if (VT == MVT::f32 || VT == MVT::f64) { + const auto *TLI = static_cast<const NVPTXTargetLowering *>( + &DAG.getTargetLoweringInfo()); + if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) + return SDValue(); + + // For floating point: + // Do the fusion only when the mul has fewer than 5 uses, all of + // which are adds. + // The heuristic is that if a use is not an add, then that use + // cannot be fused into an fma, so the mul is still needed anyway. + // If there are more than 4 uses, even if they are all adds, fusing + // them will increase register pressure. + // + int numUses = 0; + int nonAddCount = 0; + for (SDNode::use_iterator UI = N0.getNode()->use_begin(), + UE = N0.getNode()->use_end(); + UI != UE; ++UI) { + numUses++; + SDNode *User = *UI; + if (User->getOpcode() != ISD::FADD) + ++nonAddCount; + } + if (numUses >= 5) + return SDValue(); + if (nonAddCount) { + int orderNo = N->getIROrder(); + int orderNo2 = N0.getNode()->getIROrder(); + // A simple heuristic for estimating potential register pressure: + // the IR-order difference approximates the distance between def and + // use, and the longer that distance, the more likely it is to cause + // register pressure. + if (orderNo - orderNo2 < 500) + return SDValue(); + + // Now, check if at least one of the FMUL's operands is live beyond the node N, + // which guarantees that the FMA will not increase register pressure at node N.
+ bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) + opIsLive = true; + + if (!opIsLive) + for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + return SDValue(); + } + + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + } + + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + if (SDValue Result = + PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // The type legalizer turns a vector load of i8 values into a zextload to i16 + // registers, optionally ANY_EXTENDs it (if target type is integer), + // and ANDs off the high 8 bits. Since we turn this load into a + // target-specific DAG node, the DAG combiner fails to eliminate these AND + // nodes. Do that here. + SDValue Val = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (isa<ConstantSDNode>(Val)) { + std::swap(Val, Mask); + } + + SDValue AExt; + // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and + if (Val.getOpcode() == ISD::ANY_EXTEND) { + AExt = Val; + Val = Val->getOperand(0); + } + + if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { + Val = Val->getOperand(0); + } + + if (Val->getOpcode() == NVPTXISD::LoadV2 || + Val->getOpcode() == NVPTXISD::LoadV4) { + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + if (MaskVal != 0xff) { + // Not an AND that chops off top 8 bits + return SDValue(); + } + + MemSDNode *Mem = dyn_cast<MemSDNode>(Val); + if (!Mem) { + // Not a MemSDNode?!? + return SDValue(); + } + + EVT MemVT = Mem->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { + // We only handle the i8 case + return SDValue(); + } + + unsigned ExtType = + cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> + getZExtValue(); + if (ExtType == ISD::SEXTLOAD) { + // If for some reason the load is a sextload, the and is needed to zero + // out the high 8 bits + return SDValue(); + } + + bool AddTo = false; + if (AExt.getNode() != nullptr) { + // Re-insert the ext as a zext. + Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), + AExt.getValueType(), Val); + AddTo = true; + } + + // If we get here, the AND is unnecessary. 
+    DCI.CombineTo(N, Val, AddTo);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformREMCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
+
+  // Don't do anything at less than -O2.
+  if (OptLevel < CodeGenOpt::Default)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  bool IsSigned = N->getOpcode() == ISD::SREM;
+  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
+
+  const SDValue &Num = N->getOperand(0);
+  const SDValue &Den = N->getOperand(1);
+
+  for (const SDNode *U : Num->uses()) {
+    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
+        U->getOperand(1) == Den) {
+      // Num % Den -> Num - (Num / Den) * Den
+      return DAG.getNode(ISD::SUB, DL, VT, Num,
+                         DAG.getNode(ISD::MUL, DL, VT,
+                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
+                                     Den));
+    }
+  }
+  return SDValue();
+}
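The rewrite above leans on the identity n % d == n - (n / d) * d, which holds for signed and unsigned operands alike because integer division truncates toward zero in PTX just as in C++; it only pays off when the n / d node already exists, which is exactly what the use-scan checks. A quick standalone check (illustration only, not patch code):

    #include <cassert>

    int main() {
      int pairs[][2] = {{7, 3}, {-7, 3}, {7, -3}, {-7, -3}};
      for (auto &p : pairs) {
        int n = p[0], d = p[1];
        assert(n % d == n - (n / d) * d); // the form emitted by the combine
      }
      unsigned un = 7u, ud = 3u;
      assert(un % ud == un - (un / ud) * ud);
    }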
+enum OperandSignedness {
+  Signed = 0,
+  Unsigned,
+  Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information.  The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+                                      unsigned OptSize,
+                                      OperandSignedness &S) {
+  S = Unknown;
+
+  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() <= OptSize) {
+      S = Signed;
+      return true;
+    }
+  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() <= OptSize) {
+      S = Unsigned;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information.  If the
+/// operands contain a constant, it should appear as the RHS operand.  The
+/// signedness of the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+                                        unsigned OptSize,
+                                        bool &IsSigned) {
+  OperandSignedness LHSSign;
+
+  // The LHS operand must be a demotable op.
+  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+    return false;
+
+  // We should have been able to determine the signedness from the LHS.
+  if (LHSSign == Unknown)
+    return false;
+
+  IsSigned = (LHSSign == Signed);
+
+  // The RHS can be a demotable op or a constant.
+  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+    const APInt &Val = CI->getAPIntValue();
+    if (LHSSign == Unsigned) {
+      return Val.isIntN(OptSize);
+    } else {
+      return Val.isSignedIntN(OptSize);
+    }
+  } else {
+    OperandSignedness RHSSign;
+    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+      return false;
+
+    return LHSSign == RHSSign;
+  }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a
+/// multiply of M/2 bits that produces an M-bit result (i.e. mul.wide).  This
+/// transform works on both multiply DAG nodes and SHL DAG nodes with a
+/// constant shift amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT MulType = N->getValueType(0);
+  if (MulType != MVT::i32 && MulType != MVT::i64) {
+    return SDValue();
+  }
+
+  SDLoc DL(N);
+  unsigned OptSize = MulType.getSizeInBits() >> 1;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Canonicalize the multiply so the constant (if any) is on the right.
+  if (N->getOpcode() == ISD::MUL) {
+    if (isa<ConstantSDNode>(LHS)) {
+      std::swap(LHS, RHS);
+    }
+  }
+
+  // If we have a SHL, determine the actual multiply amount.
+  if (N->getOpcode() == ISD::SHL) {
+    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+    if (!ShlRHS) {
+      return SDValue();
+    }
+
+    APInt ShiftAmt = ShlRHS->getAPIntValue();
+    unsigned BitWidth = MulType.getSizeInBits();
+    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
+    } else {
+      return SDValue();
+    }
+  }
+
+  bool Signed;
+  // Verify that our operands are demotable.
+  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+    return SDValue();
+  }
+
+  EVT DemotedVT;
+  if (MulType == MVT::i32) {
+    DemotedVT = MVT::i16;
+  } else {
+    DemotedVT = MVT::i32;
+  }
+
+  // Truncate the operands to the correct size.  Note that these are just
+  // for type consistency and will (likely) be eliminated in later phases.
+  SDValue TruncLHS =
+      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
+  SDValue TruncRHS =
+      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
+
+  unsigned Opc;
+  if (Signed) {
+    Opc = NVPTXISD::MUL_WIDE_SIGNED;
+  } else {
+    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+  }
+
+  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
+}
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0.
+    if (SDValue Ret = TryMULWIDECombine(N, DCI))
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0.
+    if (SDValue Ret = TryMULWIDECombine(N, DCI))
+      return Ret;
+  }
+
+  return SDValue();
+}
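The demotion is sound because a product of two values that each fit in M/2 bits is computed exactly by an M/2 x M/2 -> M widening multiply; the signedness check matters because the two widening forms disagree once the top bit is set. A standalone model of the 64-bit case (illustration only; mul.wide.s32 and mul.wide.u32 are the PTX instructions this selects to):

    #include <cassert>
    #include <cstdint>

    // Models of PTX mul.wide.s32 / mul.wide.u32: 32x32 -> 64 widening multiply.
    static int64_t mul_wide_s32(int32_t a, int32_t b) {
      return static_cast<int64_t>(a) * static_cast<int64_t>(b);
    }
    static uint64_t mul_wide_u32(uint32_t a, uint32_t b) {
      return static_cast<uint64_t>(a) * static_cast<uint64_t>(b);
    }

    int main() {
      // The same bit pattern is -1 signed but 4294967295 unsigned, so the
      // signed and unsigned wide forms are not interchangeable.
      assert(mul_wide_s32(-1, 2) == -2);
      assert(mul_wide_u32(0xFFFFFFFFu, 2) == 0x1FFFFFFFEull);
    }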
+static SDValue PerformSETCCCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  EVT CCType = N->getValueType(0);
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+
+  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
+    return SDValue();
+
+  SDLoc DL(N);
+  // setp.f16x2 returns two scalar predicates, which we need to convert back
+  // to v2i1.  The returned result will be scalarized by the legalizer, but
+  // the comparison will remain a single vector instruction.
+  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
+                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
+                                   {A, B, N->getOperand(2)});
+  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
+                         CCNode.getValue(1));
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+  switch (N->getOpcode()) {
+    default: break;
+    case ISD::ADD:
+    case ISD::FADD:
+      return PerformADDCombine(N, DCI, STI, OptLevel);
+    case ISD::MUL:
+      return PerformMULCombine(N, DCI, OptLevel);
+    case ISD::SHL:
+      return PerformSHLCombine(N, DCI, OptLevel);
+    case ISD::AND:
+      return PerformANDCombine(N, DCI);
+    case ISD::UREM:
+    case ISD::SREM:
+      return PerformREMCombine(N, DCI, OptLevel);
+    case ISD::SETCC:
+      return PerformSETCCCombine(N, DCI);
+  }
+  return SDValue();
+}
+
+/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
+static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &Results) {
+  EVT ResVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  assert(ResVT.isVector() && "Vector load must have vector type");
+
+  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+  // legal.  We can (and should) split that into 2 loads of <2 x double>
+  // here, but I'm leaving that as a TODO for now.
+  assert(ResVT.isSimple() && "Can only handle simple types");
+  switch (ResVT.getSimpleVT().SimpleTy) {
+  default:
+    return;
+  case MVT::v2i8:
+  case MVT::v2i16:
+  case MVT::v2i32:
+  case MVT::v2i64:
+  case MVT::v2f16:
+  case MVT::v2f32:
+  case MVT::v2f64:
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v4i32:
+  case MVT::v4f16:
+  case MVT::v4f32:
+  case MVT::v8f16: // <4 x f16x2>
+    // This is a "native" vector type.
+    break;
+  }
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  auto &TD = DAG.getDataLayout();
+  unsigned PrefAlign =
+      TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this
+    // vector load be scalarized.  Note that we may still be able to emit
+    // smaller vector loads.  For example, if we are loading a <4 x float>
+    // with an alignment of 8, this check will fail but the legalizer will
+    // try again with 2 x <2 x float>, which will succeed with an alignment
+    // of 8.
+    return;
+  }
+
+  EVT EltVT = ResVT.getVectorElementType();
+  unsigned NumElts = ResVT.getVectorNumElements();
+
+  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // loaded type to i16 and propagate the "real" type as the memory type.
+  bool NeedTrunc = false;
+  if (EltVT.getSizeInBits() < 16) {
+    EltVT = MVT::i16;
+    NeedTrunc = true;
+  }
+
+  unsigned Opcode = 0;
+  SDVTList LdResVTs;
+  bool LoadF16x2 = false;
+
+  switch (NumElts) {
+  default:
+    return;
+  case 2:
+    Opcode = NVPTXISD::LoadV2;
+    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+    break;
+  case 4: {
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+    LdResVTs = DAG.getVTList(ListVTs);
+    break;
+  }
+  case 8: {
+    // v8f16 is a special case.  PTX does not have an ld.v8.f16
+    // instruction.  Instead, we split the vector into v2f16 chunks and
+    // load them with ld.v4.b32.
+    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+    LoadF16x2 = true;
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
+                     MVT::Other};
+    LdResVTs = DAG.getVTList(ListVTs);
+    break;
+  }
+  }
+
+  // Copy regular operands.
+  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
+
+  // The select routine does not have access to the LoadSDNode instance, so
+  // pass along the extension information.
+  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                          LD->getMemoryVT(),
+                                          LD->getMemOperand());
+
+  SmallVector<SDValue, 8> ScalarRes;
+  if (LoadF16x2) {
+    // Split v2f16 subvectors back into individual elements.
+    NumElts /= 2;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue SubVector = NewLD.getValue(i);
+      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+                               DAG.getIntPtrConstant(0, DL));
+      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+                               DAG.getIntPtrConstant(1, DL));
+      ScalarRes.push_back(E0);
+      ScalarRes.push_back(E1);
+    }
+  } else {
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue Res = NewLD.getValue(i);
+      if (NeedTrunc)
+        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(),
+                          Res);
+      ScalarRes.push_back(Res);
+    }
+  }
+
+  SDValue LoadChain = NewLD.getValue(NumElts);
+
+  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+  Results.push_back(BuildVec);
+  Results.push_back(LoadChain);
+}
+
+static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &Results) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Intrin = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Get the intrinsic ID.
+  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+  switch (IntrinNo) {
+  default:
+    return;
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p: {
+    EVT ResVT = N->getValueType(0);
+
+    if (ResVT.isVector()) {
+      // Vector LDG/LDU.
+
+      unsigned NumElts = ResVT.getVectorNumElements();
+      EVT EltVT = ResVT.getVectorElementType();
+
+      // Since LDU/LDG are target nodes, we cannot rely on DAG type
+      // legalization.  Therefore, we must ensure the type is legal.  For i1
+      // and i8, we set the loaded type to i16 and propagate the "real" type
+      // as the memory type.
+      bool NeedTrunc = false;
+      if (EltVT.getSizeInBits() < 16) {
+        EltVT = MVT::i16;
+        NeedTrunc = true;
+      }
+
+      unsigned Opcode = 0;
+      SDVTList LdResVTs;
+
+      switch (NumElts) {
+      default:
+        return;
+      case 2:
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV2;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV2;
+          break;
+        }
+        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+        break;
+      case 4: {
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV4;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV4;
+          break;
+        }
+        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+        LdResVTs = DAG.getVTList(ListVTs);
+        break;
+      }
+      }
+
+      SmallVector<SDValue, 8> OtherOps;
+
+      // Copy regular operands.
+
+      OtherOps.push_back(Chain); // Chain
+      // Skip operand 1 (intrinsic ID).
+      // Others.
+      OtherOps.append(N->op_begin() + 2, N->op_end());
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                              MemSD->getMemoryVT(),
+                                              MemSD->getMemOperand());
+
+      SmallVector<SDValue, 4> ScalarRes;
+
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Res = NewLD.getValue(i);
+        if (NeedTrunc)
+          Res =
+              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+        ScalarRes.push_back(Res);
+      }
+
+      SDValue LoadChain = NewLD.getValue(NumElts);
+
+      SDValue BuildVec =
+          DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+      Results.push_back(BuildVec);
+      Results.push_back(LoadChain);
+    } else {
+      // i8 LDG/LDU.
+      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+             "Custom handling of non-i8 ldu/ldg?");
+
+      // Just copy all operands as-is.
+      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+
+      // Force the output to i16.
+      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      // We make sure the memory type is i8, which will be used during isel
+      // to select the proper instruction.
+      SDValue NewLD =
+          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+                                  MVT::i8, MemSD->getMemOperand());
+
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                    NewLD.getValue(0)));
+      Results.push_back(NewLD.getValue(1));
+    }
+  }
+  }
+}
+
+void NVPTXTargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    report_fatal_error("Unhandled custom legalization");
+  case ISD::LOAD:
+    ReplaceLoadVector(N, DAG, Results);
+    return;
+  case ISD::INTRINSIC_W_CHAIN:
+    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+    return;
+  }
+}
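Both legalization paths above use the same trick for sub-16-bit element types: load into an i16 register, record the narrow type as the memory type for isel, and TRUNCATE afterwards. A minimal standalone illustration of why the round trip is lossless for zext loads (plain C++, not patch code):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t mem = 0xAB;                          // i8 value in memory
      uint16_t widened = mem;                      // zext load into an i16 register
      uint8_t val = static_cast<uint8_t>(widened); // the TRUNCATE emitted afterwards
      assert(val == 0xAB);                         // low 8 bits are unchanged
    }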
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+  delete static_cast<NVPTXSection *>(TextSection);
+  delete static_cast<NVPTXSection *>(DataSection);
+  delete static_cast<NVPTXSection *>(BSSSection);
+  delete static_cast<NVPTXSection *>(ReadOnlySection);
+
+  delete static_cast<NVPTXSection *>(StaticCtorSection);
+  delete static_cast<NVPTXSection *>(StaticDtorSection);
+  delete static_cast<NVPTXSection *>(LSDASection);
+  delete static_cast<NVPTXSection *>(EHFrameSection);
+  delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
+  delete static_cast<NVPTXSection *>(DwarfInfoSection);
+  delete static_cast<NVPTXSection *>(DwarfLineSection);
+  delete static_cast<NVPTXSection *>(DwarfFrameSection);
+  delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
+  delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
+  delete static_cast<NVPTXSection *>(DwarfStrSection);
+  delete static_cast<NVPTXSection *>(DwarfLocSection);
+  delete static_cast<NVPTXSection *>(DwarfARangesSection);
+  delete static_cast<NVPTXSection *>(DwarfRangesSection);
+  delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
+}
+
+MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  return getDataSection();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 9d7b70d80c11..ef04a8573d45 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -17,7 +17,7 @@
 
 #include "NVPTX.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
 
 namespace llvm {
 namespace NVPTXISD {
@@ -448,6 +448,7 @@ public:
   const char *getTargetNodeName(unsigned Opcode) const override;
 
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          MachineFunction &MF,
                           unsigned Intrinsic) const override;
 
   /// isLegalAddressingMode - Return true if the addressing mode represented
@@ -456,7 +457,8 @@ public:
   /// reduction (LoopStrengthReduce.cpp) and memory optimization for
   /// address mode (CodeGenPrepare.cpp)
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
-                             unsigned AS) const override;
+                             unsigned AS,
+                             Instruction *I = nullptr) const override;
 
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
     // Truncating 64-bit to 32-bit is free in SASS.
@@ -490,7 +492,7 @@ public:
   std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
                            const SmallVectorImpl<ISD::OutputArg> &,
                            unsigned retAlignment,
-                           const ImmutableCallSite *CS) const;
+                           ImmutableCallSite CS) const;
 
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -570,9 +572,8 @@ private:
                            SelectionDAG &DAG) const override;
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-  unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
-                                Type *Ty, unsigned Idx,
-                                const DataLayout &DL) const;
+  unsigned getArgumentAlignment(SDValue Callee, ImmutableCallSite CS, Type *Ty,
+                                unsigned Idx, const DataLayout &DL) const;
 };
 } // namespace llvm
 
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index d284282e28c5..18ba7684ae51 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -16,7 +16,7 @@
 
 #include "NVPTX.h"
 #include "NVPTXRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "NVPTXGenInstrInfo.inc"
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5b5ea1ed639..92152a64e525 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1,3165 +1,3169 @@
-//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the PTX instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-include "NVPTXInstrFormats.td"
-
-// A NOP instruction
-let hasSideEffects = 0 in {
- def NOP : NVPTXInst<(outs), (ins), "", []>;
-}
-
-let OperandType = "OPERAND_IMMEDIATE" in {
- def f16imm : Operand<f16>;
-}
-
-// List of vector specific properties
-def isVecLD : VecInstTypeEnum<1>;
-def isVecST : VecInstTypeEnum<2>;
-def isVecBuild : VecInstTypeEnum<3>;
-def isVecShuffle : VecInstTypeEnum<4>;
-def isVecExtract : VecInstTypeEnum<5>;
-def isVecInsert : VecInstTypeEnum<6>;
-def isVecDest : VecInstTypeEnum<7>;
-def isVecOther : VecInstTypeEnum<15>;
-
-//===----------------------------------------------------------------------===//
-// NVPTX Operand Definitions.
-//===----------------------------------------------------------------------===//
-
-def brtarget : Operand<OtherVT>;
-
-// CVT conversion modes
-// These must match the enum in NVPTX.h
-def CvtNONE : PatLeaf<(i32 0x0)>;
-def CvtRNI : PatLeaf<(i32 0x1)>;
-def CvtRZI : PatLeaf<(i32 0x2)>;
-def CvtRMI : PatLeaf<(i32 0x3)>;
-def CvtRPI : PatLeaf<(i32 0x4)>;
-def CvtRN : PatLeaf<(i32 0x5)>;
-def CvtRZ : PatLeaf<(i32 0x6)>;
-def CvtRM : PatLeaf<(i32 0x7)>;
-def CvtRP : PatLeaf<(i32 0x8)>;
-
-def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
-def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
-def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
-def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
-def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
-def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
-def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
-def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
-def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
-
-def CvtSAT : PatLeaf<(i32 0x20)>;
-def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
-
-def CvtMode : Operand<i32> {
- let PrintMethod = "printCvtMode";
-}
-
-// Compare modes
-// These must match the enum in NVPTX.h
-def CmpEQ : PatLeaf<(i32 0)>;
-def CmpNE : PatLeaf<(i32 1)>;
-def CmpLT : PatLeaf<(i32 2)>;
-def CmpLE : PatLeaf<(i32 3)>;
-def CmpGT : PatLeaf<(i32 4)>;
-def CmpGE : PatLeaf<(i32 5)>;
-def CmpEQU : PatLeaf<(i32 10)>;
-def CmpNEU : PatLeaf<(i32 11)>;
-def CmpLTU : PatLeaf<(i32 12)>;
-def CmpLEU : PatLeaf<(i32 13)>;
-def CmpGTU : PatLeaf<(i32 14)>;
-def CmpGEU : PatLeaf<(i32 15)>;
-def CmpNUM : PatLeaf<(i32 16)>;
-def CmpNAN : PatLeaf<(i32 17)>;
-
-def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
-def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
-def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
-def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
-def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
-def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
-def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
-def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
-def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
-def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
-def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
-def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
-def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
-def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
-
-def CmpMode : Operand<i32> {
- let PrintMethod = "printCmpMode";
-}
-def VecElement : Operand<i32> {
- let PrintMethod = "printVecElement";
-}
-
-//===----------------------------------------------------------------------===//
-// NVPTX Instruction Predicate Definitions
-//===----------------------------------------------------------------------===//
-
-
-def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
-def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
-def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
-def useAtomRedG32forGen32 :
- Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
-def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
-def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
-def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
-def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
-def useAtomRedG64forGen64 :
- Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
-def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
-def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
-def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
-def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
-def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
-def hasVote : Predicate<"Subtarget->hasVote()">;
-def hasDouble : Predicate<"Subtarget->hasDouble()">;
-def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
-def hasLDG : Predicate<"Subtarget->hasLDG()">;
-def hasLDU : Predicate<"Subtarget->hasLDU()">;
-def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
-
-def doF32FTZ : Predicate<"useF32FTZ()">;
-def doNoF32FTZ : Predicate<"!useF32FTZ()">;
-
-def doMulWide : Predicate<"doMulWide">;
-
-def allowFMA : Predicate<"allowFMA()">;
-def noFMA : Predicate<"!allowFMA()">;
-def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
-
-def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
-def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
-
-def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
-def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
-
-def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
-def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
-
-def true : Predicate<"true">;
-
-def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
-
-def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
-
-//===----------------------------------------------------------------------===//
-// Some Common Instruction Class Templates
-//===----------------------------------------------------------------------===//
-
-// Template for instructions which take three int64, int32, or int16 args.
-// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
-multiclass I3<string OpcStr, SDNode OpNode> {
- def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
- def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def i16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
- def i16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
-}
-
-// Template for instructions which take 3 int32 args. The instructions are
-// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
-multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
-}
-
-// Template for instructions which take three fp64 or fp32 args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
-//
-// Also defines ftz (flush subnormal inputs and results to sign-preserving
-// zero) variants for fp32 functions.
-//
-// This multiclass should be used for nodes that cannot be folded into FMAs.
-// For nodes that can be folded into FMAs (i.e. adds and muls), use
-// F3_fma_component.
-multiclass F3<string OpcStr, SDNode OpNode> {
- def f64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
- def f64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
- def f32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ]>;
- def f32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ]>;
- def f32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
- def f32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
-}
-
-// Template for instructions which take three FP args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
-//
-// Also defines ftz (flush subnormal inputs and results to sign-preserving
-// zero) variants for fp32/fp16 functions.
-//
-// This multiclass should be used for nodes that can be folded to make fma ops.
-// In this case, we use the ".rn" variant when FMA is disabled, as it behaves
-// just like the non-".rn" op but prevents ptxas from creating FMAs (see the
-// sketch after this multiclass for why fusing changes results).
-multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
- def f64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[allowFMA]>;
- def f64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
- def f32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA]>;
- def f32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
-
- def f16rr_ftz :
- NVPTXInst<(outs Float16Regs:$dst),
- (ins Float16Regs:$a, Float16Regs:$b),
- !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
- [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
- Requires<[useFP16Math, allowFMA, doF32FTZ]>;
- def f16rr :
- NVPTXInst<(outs Float16Regs:$dst),
- (ins Float16Regs:$a, Float16Regs:$b),
- !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
- [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
- Requires<[useFP16Math, allowFMA]>;
-
- def f16x2rr_ftz :
- NVPTXInst<(outs Float16x2Regs:$dst),
- (ins Float16x2Regs:$a, Float16x2Regs:$b),
- !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
- [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
- Requires<[useFP16Math, allowFMA, doF32FTZ]>;
- def f16x2rr :
- NVPTXInst<(outs Float16x2Regs:$dst),
- (ins Float16x2Regs:$a, Float16x2Regs:$b),
- !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
- [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
- Requires<[useFP16Math, allowFMA]>;
-
- // These have strange names so we don't perturb existing mir tests.
- def _rnf64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[noFMA]>;
- def _rnf64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
- def _rnf32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def _rnf32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def _rnf32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA]>;
- def _rnf32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
- def _rnf16rr_ftz :
- NVPTXInst<(outs Float16Regs:$dst),
- (ins Float16Regs:$a, Float16Regs:$b),
- !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
- [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
- Requires<[useFP16Math, noFMA, doF32FTZ]>;
- def _rnf16rr :
- NVPTXInst<(outs Float16Regs:$dst),
- (ins Float16Regs:$a, Float16Regs:$b),
- !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
- [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
- Requires<[useFP16Math, noFMA]>;
- def _rnf16x2rr_ftz :
- NVPTXInst<(outs Float16x2Regs:$dst),
- (ins Float16x2Regs:$a, Float16x2Regs:$b),
- !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
- [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
- Requires<[useFP16Math, noFMA, doF32FTZ]>;
- def _rnf16x2rr :
- NVPTXInst<(outs Float16x2Regs:$dst),
- (ins Float16x2Regs:$a, Float16x2Regs:$b),
- !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
- [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
- Requires<[useFP16Math, noFMA]>;
-}
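For context on the ".rn" variants: a fused multiply-add rounds once, while a separate mul and add round twice, so letting ptxas fuse them can change the last bit of the result. A standalone sketch of the difference (assumes IEEE-754 binary32 with no excess intermediate precision; the volatile blocks compiler-side contraction):

    #include <cassert>
    #include <cmath>

    int main() {
      float a = 1.0f + 0x1.0p-12f;              // exactly representable
      float c = -(1.0f + 0x1.0p-11f);
      volatile float prod = a * a;              // exact a*a = 1 + 2^-11 + 2^-24,
                                                // rounded to 1 + 2^-11
      assert(prod + c == 0.0f);                 // two roundings lose the tail
      assert(std::fma(a, a, c) == 0x1.0p-24f);  // one rounding keeps it
    }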
-
-// Template for unary operations which take one f32 or f64 source operand.
-// Provides three instructions: <OpcStr>.f64, <OpcStr>.f32, and
-// <OpcStr>.ftz.f32 (flush subnormal inputs and results to zero).
-multiclass F2<string OpcStr, SDNode OpNode> {
- def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
- !strconcat(OpcStr, ".f64 \t$dst, $a;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
- def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
- Requires<[doF32FTZ]>;
- def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
-}
-
-//===----------------------------------------------------------------------===//
-// NVPTX Instructions.
-//===----------------------------------------------------------------------===//
-
-//-----------------------------------
-// Type Conversion
-//-----------------------------------
-
-let hasSideEffects = 0 in {
- // Generate a cvt to the given type from all possible types. Each instance
- // takes a CvtMode immediate that defines the conversion mode to use. It can
- // be CvtNONE to omit a conversion mode.
- multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
- def _s8 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s8 \t$dst, $src;"), []>;
- def _u8 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u8 \t$dst, $src;"), []>;
- def _s16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s16 \t$dst, $src;"), []>;
- def _u16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u16 \t$dst, $src;"), []>;
- def _s32 :
- NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s32 \t$dst, $src;"), []>;
- def _u32 :
- NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u32 \t$dst, $src;"), []>;
- def _s64 :
- NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s64 \t$dst, $src;"), []>;
- def _u64 :
- NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u64 \t$dst, $src;"), []>;
- def _f16 :
- NVPTXInst<(outs RC:$dst),
- (ins Float16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f16 \t$dst, $src;"), []>;
- def _f32 :
- NVPTXInst<(outs RC:$dst),
- (ins Float32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f32 \t$dst, $src;"), []>;
- def _f64 :
- NVPTXInst<(outs RC:$dst),
- (ins Float64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f64 \t$dst, $src;"), []>;
- }
-
- // Generate cvts from all types to all types.
- defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
- defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
- defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
- defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
- defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
- defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
- defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
- defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
- defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
- defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
- defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
-
- // These cvts are different from those above: The source and dest registers
- // are of the same type.
- def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.s16.s8 \t$dst, $src;", []>;
- def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s8 \t$dst, $src;", []>;
- def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s16 \t$dst, $src;", []>;
- def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s8 \t$dst, $src;", []>;
- def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s16 \t$dst, $src;", []>;
- def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s32 \t$dst, $src;", []>;
-}
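As a reference for the conversion modes defined earlier, the integer-rounding variants line up with the usual C rounding functions; a standalone sketch (illustration only; nearbyint assumes the default round-to-nearest-even mode):

    #include <cassert>
    #include <cmath>

    int main() {
      assert(std::floor(-1.5) == -2.0);     // CvtRMI: round toward -infinity
      assert(std::ceil(-1.5) == -1.0);      // CvtRPI: round toward +infinity
      assert(std::trunc(-1.5) == -1.0);     // CvtRZI: round toward zero
      assert(std::nearbyint(-1.5) == -2.0); // CvtRNI: nearest, ties to even
    }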
-
-//-----------------------------------
-// Integer Arithmetic
-//-----------------------------------
-
-// Template for xor masquerading as int1 arithmetic.
-multiclass ADD_SUB_i1<SDNode OpNode> {
- def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
- def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
-}
-
-// int1 addition and subtraction are both just xor.
-defm ADD_i1 : ADD_SUB_i1<add>;
-defm SUB_i1 : ADD_SUB_i1<sub>;
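That i1 addition and subtraction both reduce to xor is just arithmetic mod 2; an exhaustive standalone check (not patch code):

    #include <cassert>

    int main() {
      for (int a = 0; a <= 1; ++a)
        for (int b = 0; b <= 1; ++b) {
          assert(((a + b) & 1) == (a ^ b)); // i1 addition wraps mod 2
          assert(((a - b) & 1) == (a ^ b)); // i1 subtraction does too
        }
    }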
-
-// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
-// also use these for unsigned arithmetic.
-defm ADD : I3<"add.s", add>;
-defm SUB : I3<"sub.s", sub>;
-
-// int32 addition and subtraction with carry-out.
-// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
-defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
-defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
-
-// int32 addition and subtraction with carry-in and carry-out.
-defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
-defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
-
-defm MULT : I3<"mul.lo.s", mul>;
-
-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
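mul.hi returns the upper half of the double-width product, i.e. the part that mul.lo throws away. A standalone model of the 32-bit case (illustration only; assumes the usual arithmetic right shift for the signed form):

    #include <cassert>
    #include <cstdint>

    static int32_t mulhi_s32(int32_t a, int32_t b) {
      return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 32);
    }
    static uint32_t mulhi_u32(uint32_t a, uint32_t b) {
      return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
    }

    int main() {
      assert(mulhi_u32(0x80000000u, 2) == 1); // product 0x100000000, high word 1
      assert(mulhi_s32(-1, 1) == -1);         // sign-extended product, high word -1
    }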
-
-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
-
-// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
-// will lower them.
-defm SREM : I3<"rem.s", srem>;
-defm UREM : I3<"rem.u", urem>;
-
-// Integer absolute value, expanded to the PTX abs instruction of the
-// requested size.
-multiclass ABS<RegisterClass RC, string SizeName> {
- def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
- !strconcat("abs", SizeName, " \t$dst, $a;"),
- [(set RC:$dst, (abs RC:$a))]>;
-}
-defm ABS_16 : ABS<Int16Regs, ".s16">;
-defm ABS_32 : ABS<Int32Regs, ".s32">;
-defm ABS_64 : ABS<Int64Regs, ".s64">;
-
-// Integer min/max.
-defm SMAX : I3<"max.s", smax>;
-defm UMAX : I3<"max.u", umax>;
-defm SMIN : I3<"min.s", smin>;
-defm UMIN : I3<"min.u", umin>;
-
-//
-// Wide multiplication
-//
-def MULWIDES64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-
-def MULWIDEU64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-
-def MULWIDES32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-
-def MULWIDEU32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-
-def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
-
-// Matchers for signed, unsigned mul.wide ISD nodes.
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
- (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
- (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
- (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
- (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-// Predicates used for converting some patterns to mul.wide.
-def SInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(32);
-}]>;
-
-def UInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(32);
-}]>;
-
-def SInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(16);
-}]>;
-
-def UInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(16);
-}]>;
-
-def Int5Const : PatLeaf<(imm), [{
- // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(32);
-}]>;
-
-def Int4Const : PatLeaf<(imm), [{
- // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(16);
-}]>;
-
-def SHL2MUL32 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(32, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
-}]>;
-
-def SHL2MUL16 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(16, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
-}]>;
-
-// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
- (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
-def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
- (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
- (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
-def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
- (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
-
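These SHL patterns rely on (x << c) equaling x * (1 << c) whenever the shift amount is in range, which is exactly what Int5Const/Int4Const guarantee; once the extension is folded in, the whole expression becomes one widening multiply. A standalone check (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t x = 12345;
      unsigned c = 7; // 0 <= c < 32, the range Int5Const accepts
      int64_t shifted = static_cast<int64_t>(x) << c;         // shl (sext x), c
      int64_t widened = static_cast<int64_t>(x) * (1LL << c); // mul.wide.s32 x, 1<<c
      assert(shifted == widened);
    }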
-// Convert "sign/zero-extend then multiply" to mul.wide.
-def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
- Requires<[doMulWide]>;
-
-//
-// Integer multiply-add
-//
-def SDTIMAD :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
- SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
-def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
-
-def MAD16rrr :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
-def MAD16rri :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
-def MAD16rir :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
-def MAD16rii :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD32rrr :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
-def MAD32rri :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
-def MAD32rir :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
-def MAD32rii :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD64rrr :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
-def MAD64rri :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
-def MAD64rir :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
-def MAD64rii :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
-
-def INEG16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "neg.s16 \t$dst, $src;",
- [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
-def INEG32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "neg.s32 \t$dst, $src;",
- [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
-def INEG64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "neg.s64 \t$dst, $src;",
- [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
-
-//-----------------------------------
-// Floating Point Arithmetic
-//-----------------------------------
-
-// Constant 1.0f
-def FloatConst1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
- N->getValueAPF().convertToFloat() == 1.0f;
-}]>;
-// Constant 1.0 (double)
-def DoubleConst1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
- N->getValueAPF().convertToDouble() == 1.0;
-}]>;
-
-// Loads an FP16 constant into a register.
-//
-// ptxas does not have hex representation for fp16, so we can't use
-// fp16 immediate values in .f16 instructions. Instead we have to load
-// the constant into a register using mov.b16.
-def LOAD_CONST_F16 :
- NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
- "mov.b16 \t$dst, $a;", []>;
-
-defm FADD : F3_fma_component<"add", fadd>;
-defm FSUB : F3_fma_component<"sub", fsub>;
-defm FMUL : F3_fma_component<"mul", fmul>;
-
-defm FMIN : F3<"min", fminnum>;
-defm FMAX : F3<"max", fmaxnum>;
-
-defm FABS : F2<"abs", fabs>;
-defm FNEG : F2<"neg", fneg>;
-defm FSQRT : F2<"sqrt.rn", fsqrt>;
-
-//
-// F64 division
-//
-def FDIV641r :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins f64imm:$a, Float64Regs:$b),
- "rcp.rn.f64 \t$dst, $b;",
- [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
-def FDIV64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
-def FDIV64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
-
-//
-// F32 Approximate reciprocal
-//
-def FDIV321r_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV321r :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-//
-// F32 Approximate division
-//
-def FDIV32approxrr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxrr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-def FDIV32approxri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-//
-// F32 Semi-accurate reciprocal
-//
-// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
-//
-def FDIV321r_approx_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV321r_approx :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-//
-// F32 Semi-accurate division
-//
-def FDIV32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-def FDIV32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-//
-// F32 Accurate reciprocal
-//
-def FDIV321r_prec_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20, doF32FTZ]>;
-def FDIV321r_prec :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
-//
-// F32 Accurate division
-//
-def FDIV32rr_prec_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32ri_prec_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32rr_prec :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
-def FDIV32ri_prec :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[reqPTX20]>;
-
-//
-// FMA
-//
-
-multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
- def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, ImmCls:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, ImmCls:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
-}
-
-multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
- def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
- Requires<[useFP16Math, Pred]>;
-}
-
-defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
-defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>;
-defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
-defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
-defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
-defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
-defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
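-
-// Illustration (hypothetical register names): an f32 (fma a, b, c)
-// selects FMA32rrr and prints a single-rounding multiply-add:
-//   fma.rn.f32  %f4, %f1, %f2, %f3;  // round(a * b + c) in one step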
-
-// sin/cos
-def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "sin.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
- Requires<[allowUnsafeFPMath]>;
-def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "cos.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
- Requires<[allowUnsafeFPMath]>;
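-
-// Illustration (hypothetical register names): under unsafe-fp-math,
-// (fsin x) prints as:
-//   sin.approx.f32  %f2, %f1;
-// The approximation loses accuracy for large-magnitude inputs, which is
-// why these patterns are gated on allowUnsafeFPMath.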
-
-// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
-// i.e. "poor man's fmod()"
-
-// frem - f32 FTZ
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
- (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
- Float32Regs:$y))>,
- Requires<[doF32FTZ]>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
- (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
- fpimm:$y))>,
- Requires<[doF32FTZ]>;
-
-// frem - f32
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
- (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
- Float32Regs:$y))>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
- (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
- fpimm:$y))>;
-
-// frem - f64
-def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
- (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
- (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
- Float64Regs:$y))>;
-def : Pat<(frem Float64Regs:$x, fpimm:$y),
- (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
- (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
- fpimm:$y))>;
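-
-// Worked example of the lowering above: frem(5.5, 2.0) divides to get
-// 2.75, cvt with .rmi floors that to 2.0, and 5.5 - 2.0 * 2.0 = 1.5.
-// Because .rmi rounds toward minus infinity, the result is
-// x - floor(x/y)*y, which differs from C's fmod() when the signs of x
-// and y differ.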
-
-//-----------------------------------
-// Bitwise operations
-//-----------------------------------
-
-// Template for three-arg bitwise operations. Takes three args and creates
-// .b16, .b32, .b64, and .pred (predicate registers, i.e. i1) versions of
-// OpcStr.
-multiclass BITWISE<string OpcStr, SDNode OpNode> {
- def b1rr :
- NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
- def b1ri :
- NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
- def b16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
- def b16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
- def b32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def b32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def b64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
- def b64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
-}
-
-defm OR : BITWISE<"or", or>;
-defm AND : BITWISE<"and", and>;
-defm XOR : BITWISE<"xor", xor>;
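-
-// Illustration (hypothetical register names): ORb32rr from the template
-// above prints as:
-//   or.b32  %r3, %r1, %r2;
-// and the b1 variants use the predicate form, e.g. or.pred %p3, %p1, %p2;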
-
-def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
- "not.pred \t$dst, $src;",
- [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
-def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "not.b16 \t$dst, $src;",
- [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
-def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "not.b32 \t$dst, $src;",
- [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
-def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "not.b64 \t$dst, $src;",
- [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
-
-// Template for left/right shifts. Takes three operands,
-// [dest (reg), src (reg), shift (reg or imm)].
-// dest and src may be int64, int32, or int16, but shift is always int32.
-//
-// This template also defines a 32-bit shift (imm, imm) instruction.
-multiclass SHIFT<string OpcStr, SDNode OpNode> {
- def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
- def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
- def i32ii :
- NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
- def i16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
- def i16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
-}
-
-defm SHL : SHIFT<"shl.b", shl>;
-defm SRA : SHIFT<"shr.s", sra>;
-defm SRL : SHIFT<"shr.u", srl>;
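-
-// Illustration (hypothetical register names): SRAi32ri prints an
-// arithmetic right shift with an immediate amount:
-//   shr.s32  %r2, %r1, 5;
-// PTX clamps shift amounts larger than the register width, so no
-// explicit masking of $b is needed here.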
-
-// Bit-reverse
-def BREV32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
- "brev.b32 \t$dst, $a;",
- [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
-def BREV64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
- "brev.b64 \t$dst, $a;",
- [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
-
-//
-// Rotate: Use ptx shf instruction if available.
-//
-
-// 32 bit r2 = rotl r1, n
-// =>
-// r2 = shf.l r1, r1, n
-def ROTL32imm_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTL32reg_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
-
-// 32 bit r2 = rotr r1, n
-// =>
-// r2 = shf.r r1, r1, n
-def ROTR32imm_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTR32reg_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
-
-// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
-def ROT32imm_sw :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- "shl.b32 \t%lhs, $src, $amt1;\n\t"
- "shr.b32 \t%rhs, $src, $amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- []>;
-
-def SUB_FRM_32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
- Requires<[noHWROT32]>;
-def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
- Requires<[noHWROT32]>;
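-
-// Worked example: (rotl x, 3) maps to ROT32imm_sw with $amt1 = 3 and
-// $amt2 = SUB_FRM_32(3) = 29, computing (x << 3) + (x >> 29). The add
-// is equivalent to an or here because the two halves share no set bits.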
-
-// 32-bit software rotate left by register.
-def ROTL32reg_sw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- ".reg .b32 %amt2;\n\t"
- "shl.b32 \t%lhs, $src, $amt;\n\t"
- "sub.s32 \t%amt2, 32, $amt;\n\t"
- "shr.b32 \t%rhs, $src, %amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-// 32-bit software rotate right by register.
-def ROTR32reg_sw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- ".reg .b32 %amt2;\n\t"
- "shr.b32 \t%lhs, $src, $amt;\n\t"
- "sub.s32 \t%amt2, 32, $amt;\n\t"
- "shl.b32 \t%rhs, $src, %amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
-def ROT64imm_sw :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- "shl.b64 \t%lhs, $src, $amt1;\n\t"
- "shr.b64 \t%rhs, $src, $amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- []>;
-
-def SUB_FRM_64 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(64 - N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
- (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
-def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
- (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
-
-// 64-bit software rotate left by register.
-def ROTL64reg_sw :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- ".reg .u32 %amt2;\n\t"
- "shl.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
- "shr.b64 \t%rhs, $src, %amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
-
-def ROTR64reg_sw :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- ".reg .u32 %amt2;\n\t"
- "shr.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
- "shl.b64 \t%rhs, $src, %amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
-
-//
-// Funnel shift in clamp mode
-//
-
-// Create SDNodes so they can be used in the DAG code, e.g.
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-def SDTIntShiftDOp :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisInt<0>, SDTCisInt<3>]>;
-def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-def FUNSHFLCLAMP :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
-
-def FUNSHFRCLAMP :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
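-
-// Rough semantics (hypothetical register names): shf.l.clamp.b32
-// %r3, %lo, %hi, %n treats {%hi, %lo} as one 64-bit value, shifts it
-// left by min(%n, 32), and keeps the upper 32 bits -- approximately
-// (%hi << n) | (%lo >> (32 - n)) for 0 < n < 32.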
-
-//
-// BFE - bit-field extract
-//
-
-// Template for BFE instructions. Takes four args,
-// [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
-// Start may be an imm only if end is also an imm. FIXME: Is this a
-// restriction in PTX?
-//
-// dest and src may be int32 or int64, but start and end are always int32.
-multiclass BFE<string TyStr, RegisterClass RC> {
- def rrr
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
- !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
- def rri
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, i32imm:$c),
- !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
- def rii
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, i32imm:$c),
- !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
-}
-
-let hasSideEffects = 0 in {
- defm BFE_S32 : BFE<"s32", Int32Regs>;
- defm BFE_U32 : BFE<"u32", Int32Regs>;
- defm BFE_S64 : BFE<"s64", Int64Regs>;
- defm BFE_U64 : BFE<"u64", Int64Regs>;
-}
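-
-// Illustration (hypothetical register names): BFE_U32rii with start 8
-// and length 16 prints as:
-//   bfe.u32  %r2, %r1, 8, 16;  // (%r1 >> 8) & 0xffff
-// The .s32/.s64 forms sign-extend the extracted field instead.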
-
-//-----------------------------------
-// Comparison instructions (setp, set)
-//-----------------------------------
-
-// FIXME: This doesn't cover versions of set and setp that combine with a
-// boolean predicate, e.g. setp.eq.and.b16.
-
-let hasSideEffects = 0 in {
- multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr :
- NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- " \t$dst, $a, $b;"), []>;
- def ri :
- NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- " \t$dst, $a, $b;"), []>;
- def ir :
- NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- " \t$dst, $a, $b;"), []>;
- }
-}
-
-defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
-defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
-defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
-defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
-defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
-defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
-defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
-defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
-defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
-defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
-defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
-def SETP_f16rr :
- NVPTXInst<(outs Int1Regs:$dst),
- (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
- "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
- []>, Requires<[useFP16Math]>;
-
-def SETP_f16x2rr :
- NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
- (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
- "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
- []>,
- Requires<[useFP16Math]>;
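-
-// Illustration (hypothetical register names): SETP_s32rr with CmpLT
-// prints a predicate-producing compare:
-//   setp.lt.s32  %p1, %r1, %r2;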
-
-
-// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
-// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
-// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
-
-let hasSideEffects = 0 in {
- multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
- def ri : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
- def ir : NVPTXInst<(outs Int32Regs:$dst),
- (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
- }
-}
-
-defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
-defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
-defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
-defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
-defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
-defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
-defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
-defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
-defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
-defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
-defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
-defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
-
-//-----------------------------------
-// Selection instructions (selp)
-//-----------------------------------
-
-// FIXME: Missing slct
-
-// selp instructions that don't have any pattern matches; we explicitly use
-// them within this file.
-let hasSideEffects = 0 in {
- multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
- def ri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
- def ir : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
- def ii : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
- }
-
- multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
- SDNode ImmNode> {
- def rr :
- NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
- def ri :
- NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
- def ir :
- NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
- def ii :
- NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
- }
-}
-
-// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
-// good.
-defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
-defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
-defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
-defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
-defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
-defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
-defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
-defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
-defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
-defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
-defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
-defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
-
-def SELP_f16x2rr :
- NVPTXInst<(outs Float16x2Regs:$dst),
- (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
- "selp.b32 \t$dst, $a, $b, $p;",
- [(set Float16x2Regs:$dst,
- (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
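-
-// Illustration (hypothetical register names): an i32 (select p, a, b)
-// selects SELP_b32rr and prints as:
-//   selp.b32  %r3, %r1, %r2, %p1;  // %r3 = %p1 ? %r1 : %r2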
-
-//-----------------------------------
-// Data Movement (Load / Store, Move)
-//-----------------------------------
-
-def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
- [SDNPWantRoot]>;
-def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
- [SDNPWantRoot]>;
-
-def MEMri : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops Int32Regs, i32imm);
-}
-def MEMri64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops Int64Regs, i64imm);
-}
-
-def imem : Operand<iPTR> {
- let PrintMethod = "printOperand";
-}
-
-def imemAny : Operand<iPTRAny> {
- let PrintMethod = "printOperand";
-}
-
-def LdStCode : Operand<i32> {
- let PrintMethod = "printLdStCode";
-}
-
-def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
-def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
-
-// Load a memory address into a u32 or u64 register.
-def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
- "mov.u32 \t$dst, $a;",
- [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
- "mov.u64 \t$dst, $a;",
- [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-
-// Get pointer to local stack.
-let hasSideEffects = 0 in {
- def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
- "mov.u32 \t$d, __local_depot$num;", []>;
- def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
- "mov.u64 \t$d, __local_depot$num;", []>;
-}
-
-
-// copyPhysReg() is hard-coded in NVPTXInstrInfo.cpp
-let IsSimpleMove=1, hasSideEffects=0 in {
- def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
- "mov.pred \t$dst, $sss;", []>;
- def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
- "mov.u16 \t$dst, $sss;", []>;
- def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
- "mov.u32 \t$dst, $sss;", []>;
- def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
- "mov.u64 \t$dst, $sss;", []>;
-
- def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
- // We have to use .b16 here as there's no mov.f16.
- "mov.b16 \t$dst, $src;", []>;
- def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "mov.f32 \t$dst, $src;", []>;
- def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
- "mov.f64 \t$dst, $src;", []>;
-}
-
-def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
- "mov.pred \t$dst, $src;",
- [(set Int1Regs:$dst, imm:$src)]>;
-def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
- "mov.u16 \t$dst, $src;",
- [(set Int16Regs:$dst, imm:$src)]>;
-def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
- "mov.u32 \t$dst, $src;",
- [(set Int32Regs:$dst, imm:$src)]>;
-def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
- "mov.u64 \t$dst, $src;",
- [(set Int64Regs:$dst, imm:$src)]>;
-
-def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
- "mov.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, fpimm:$src)]>;
-def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
- "mov.f64 \t$dst, $src;",
- [(set Float64Regs:$dst, fpimm:$src)]>;
-
-def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
-
-//---- Copy Frame Index ----
-def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
- "add.u32 \t$dst, ${addr:add};",
- [(set Int32Regs:$dst, ADDRri:$addr)]>;
-def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
- "add.u64 \t$dst, ${addr:add};",
- [(set Int64Regs:$dst, ADDRri64:$addr)]>;
-
-//-----------------------------------
-// Comparison and Selection
-//-----------------------------------
-
-multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
- Instruction setp_16rr,
- Instruction setp_16ri,
- Instruction setp_16ir,
- Instruction setp_32rr,
- Instruction setp_32ri,
- Instruction setp_32ir,
- Instruction setp_64rr,
- Instruction setp_64ri,
- Instruction setp_64ir,
- Instruction set_16rr,
- Instruction set_16ri,
- Instruction set_16ir,
- Instruction set_32rr,
- Instruction set_32ri,
- Instruction set_32ir,
- Instruction set_64rr,
- Instruction set_64ri,
- Instruction set_64ir> {
- // i16 -> pred
- def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
- (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
- (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
- (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
- // i32 -> pred
- def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
- (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
- (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
- (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
- // i64 -> pred
- def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
- (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
- (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
- (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
-
- // i16 -> i32
- def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
- (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
- (set_16ri Int16Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
- (set_16ir imm:$a, Int16Regs:$b, Mode)>;
- // i32 -> i32
- def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
- (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
- (set_32ri Int32Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
- (set_32ir imm:$a, Int32Regs:$b, Mode)>;
- // i64 -> i32
- def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
- (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
- (set_64ri Int64Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
- (set_64ir imm:$a, Int64Regs:$b, Mode)>;
-}
-
-multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
- : ISET_FORMAT<OpNode, Mode,
- SETP_s16rr, SETP_s16ri, SETP_s16ir,
- SETP_s32rr, SETP_s32ri, SETP_s32ir,
- SETP_s64rr, SETP_s64ri, SETP_s64ir,
- SET_s16rr, SET_s16ri, SET_s16ir,
- SET_s32rr, SET_s32ri, SET_s32ir,
- SET_s64rr, SET_s64ri, SET_s64ir> {
- // TableGen doesn't like empty multiclasses.
- def : PatLeaf<(i32 0)>;
-}
-
-multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
- : ISET_FORMAT<OpNode, Mode,
- SETP_u16rr, SETP_u16ri, SETP_u16ir,
- SETP_u32rr, SETP_u32ri, SETP_u32ir,
- SETP_u64rr, SETP_u64ri, SETP_u64ir,
- SET_u16rr, SET_u16ri, SET_u16ir,
- SET_u32rr, SET_u32ri, SET_u32ir,
- SET_u64rr, SET_u64ri, SET_u64ir> {
- // TableGen doesn't like empty multiclasses.
- def : PatLeaf<(i32 0)>;
-}
-
-defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
-defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
-defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
-defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
-defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
-defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
-defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
-defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
-defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
-defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
-defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
-defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
-
-// i1 compares
-def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
- (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
- (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
-
-def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
- (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
- (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-
-// i1 compare -> i32
-def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
- (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
-          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-
-multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
- // f16 -> pred
- def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
- (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
- (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
- (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
- Requires<[useFP16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
- (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
- (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
- (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
- Requires<[useFP16Math]>;
-
- // f32 -> pred
- def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
- (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
- (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
- (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
- (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
-
- // f64 -> pred
- def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
- (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
- (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
- (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
-
- // f16 -> i32
- def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
- (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
- (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
- (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
- Requires<[useFP16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
- (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
- (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
- (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
- Requires<[useFP16Math]>;
-
- // f32 -> i32
- def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
- (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
- (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
- (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
- (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
-
- // f64 -> i32
- def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
- (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
- (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
- (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
-}
-
-defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
-defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
-defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
-defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
-defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
-defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
-
-defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
-defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
-defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
-defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
-defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
-defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
-
-defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
-defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
-defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
-defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
-defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
-defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
-
-defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
-defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
-
-// FIXME: What is this doing here? Can it be deleted?
-// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
-// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-
-def SDTDeclareParamProfile :
- SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTDeclareScalarParamProfile :
- SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
-def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
-def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
-def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
-def SDTCallValProfile : SDTypeProfile<1, 0, []>;
-def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
-def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
-def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
-def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
-def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
-
-def DeclareParam :
- SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareScalarParam :
- SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRetParam :
- SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRet :
- SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LoadParam :
- SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 :
- SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 :
- SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def PrintCall :
- SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintConvergentCall :
- SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintCallUni :
- SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintConvergentCallUni :
- SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParam :
- SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 :
- SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 :
- SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamU32 :
- SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamS32 :
- SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgBegin :
- SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArg :
- SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LastCallArg :
- SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgEnd :
- SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVoid :
- SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def Prototype :
- SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVal :
- SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def MoveParam :
- SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
-def StoreRetval :
- SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV2 :
- SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV4 :
- SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def PseudoUseParam :
- SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def RETURNNode :
- SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-
-let mayLoad = 1 in {
- class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
- []>;
-
- class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
- !strconcat("ld.param.v2", opstr,
- " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
-
- class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins i32imm:$b),
- !strconcat("ld.param.v4", opstr,
- " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
- []>;
-}
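-
-// Illustration (hypothetical register names): after a call returning a
-// 32-bit scalar, LoadParamMemI32 reads the return slot with:
-//   ld.param.b32  %r1, [retval0+0];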
-
-class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat("mov", opstr, " \t$dst, retval$b;"),
- [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
-
-let mayStore = 1 in {
- class StoreParamInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
- !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
- []>;
-
- class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
- i32imm:$a, i32imm:$b),
- !strconcat("st.param.v2", opstr,
- " \t[param$a+$b], {{$val, $val2}};"),
- []>;
-
- class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a,
- i32imm:$b),
- !strconcat("st.param.v4", opstr,
- " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
- []>;
-
- class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
- !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
- []>;
-
- class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
- !strconcat("st.param.v2", opstr,
- " \t[func_retval0+$a], {{$val, $val2}};"),
- []>;
-
- class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs),
- (ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a),
- !strconcat("st.param.v4", opstr,
- " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
- []>;
-}
-
-let isCall=1 in {
- multiclass CALL<string OpcStr, SDNode OpNode> {
- def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
- def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
- def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
- def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
- def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
- [(OpNode (i32 4))]>;
- def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
- [(OpNode (i32 5))]>;
- def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
- "retval5), "),
- [(OpNode (i32 6))]>;
- def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
- "retval5, retval6), "),
- [(OpNode (i32 7))]>;
- def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
- "retval5, retval6, retval7), "),
- [(OpNode (i32 8))]>;
- }
-}
-
-defm Call : CALL<"call", PrintCall>;
-defm CallUni : CALL<"call.uni", PrintCallUni>;
-
-// Convergent call instructions. These are identical to regular calls, except
-// they have the isConvergent bit set.
-let isConvergent=1 in {
- defm ConvergentCall : CALL<"call", PrintConvergentCall>;
- defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
-}
-
-def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
-def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
-def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
-def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
-def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
-def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
-def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
-def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
-def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
-def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
-def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
-def LoadParamMemF16 : LoadParamMemInst<Float16Regs, ".b16">;
-def LoadParamMemF16x2 : LoadParamMemInst<Float16x2Regs, ".b32">;
-def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
-def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
-def LoadParamMemV2F16 : LoadParamV2MemInst<Float16Regs, ".b16">;
-def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
-def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
-def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
-def LoadParamMemV4F16 : LoadParamV4MemInst<Float16Regs, ".b16">;
-def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
-def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
-
-def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
-def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
-
-def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
-def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
-def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
-def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
-def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
-def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
-
-def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
-def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
-def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
-
-def StoreParamF16 : StoreParamInst<Float16Regs, ".b16">;
-def StoreParamF16x2 : StoreParamInst<Float16x2Regs, ".b32">;
-def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
-def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
-def StoreParamV2F16 : StoreParamV2Inst<Float16Regs, ".b16">;
-def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
-def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
-def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
-def StoreParamV4F16 : StoreParamV4Inst<Float16Regs, ".b16">;
-def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
-def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
-
-def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
-def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
-def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
-def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
-def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
-def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
-def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
-def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
-def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
-def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
-def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
-
-def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
-def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
-def StoreRetvalF16 : StoreRetvalInst<Float16Regs, ".b16">;
-def StoreRetvalF16x2 : StoreRetvalInst<Float16x2Regs, ".b32">;
-def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
-def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
-def StoreRetvalV2F16 : StoreRetvalV2Inst<Float16Regs, ".b16">;
-def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
-def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
-def StoreRetvalV4F16 : StoreRetvalV4Inst<Float16Regs, ".b16">;
-def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
-
-def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
-def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
-def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
-def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
-
-class CallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a, ",
- [(CallArg (i32 0), regclass:$a)]>;
-
-class LastCallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a",
- [(LastCallArg (i32 0), regclass:$a)]>;
-
-def CallArgI64 : CallArgInst<Int64Regs>;
-def CallArgI32 : CallArgInst<Int32Regs>;
-def CallArgI16 : CallArgInst<Int16Regs>;
-def CallArgF64 : CallArgInst<Float64Regs>;
-def CallArgF32 : CallArgInst<Float32Regs>;
-
-def LastCallArgI64 : LastCallArgInst<Int64Regs>;
-def LastCallArgI32 : LastCallArgInst<Int32Regs>;
-def LastCallArgI16 : LastCallArgInst<Int16Regs>;
-def LastCallArgF64 : LastCallArgInst<Float64Regs>;
-def LastCallArgF32 : LastCallArgInst<Float32Regs>;
-
-def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
- [(CallArg (i32 0), (i32 imm:$a))]>;
-def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
- [(LastCallArg (i32 0), (i32 imm:$a))]>;
-
-def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
- [(CallArg (i32 1), (i32 imm:$a))]>;
-def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
- [(LastCallArg (i32 1), (i32 imm:$a))]>;
-
-def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
- [(CallVoid (Wrapper tglobaladdr:$addr))]>;
-def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
- [(CallVoid Int32Regs:$addr)]>;
-def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
- [(CallVoid Int64Regs:$addr)]>;
-def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
- [(Prototype (i32 imm:$val))]>;
-
-def DeclareRetMemInst :
- NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
- ".param .align $align .b8 retval$num[$size];",
- [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetScalarInst :
- NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".param .b$size retval$num;",
- [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetRegInst :
- NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".reg .b$size retval$num;",
- [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
-
-def DeclareParamInst :
- NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
- ".param .align $align .b8 param$a[$size];",
- [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
-def DeclareScalarParamInst :
- NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".param .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
-def DeclareScalarRegInst :
- NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".reg .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
-
-class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
- NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- !strconcat("mov", asmstr, " \t$dst, $src;"),
- [(set regclass:$dst, (MoveParam regclass:$src))]>;
-
-def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
-def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
-def MoveParamI16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.u16.u32 \t$dst, $src;",
- [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
-def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
-def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
-def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
-
-class PseudoUseParamInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$src),
- "// Pseudo use of $src",
- [(PseudoUseParam regclass:$src)]>;
-
-def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
-def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
-def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
-def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
-def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
-
-
-//
-// Load / Store Handling
-//
-multiclass LD<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
- def _areg : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
- def _areg_64 : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
- def _ari : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
- def _ari_64 : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
- def _asi : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
-}
-
-let mayLoad=1, hasSideEffects=0 in {
- defm LD_i8 : LD<Int16Regs>;
- defm LD_i16 : LD<Int16Regs>;
- defm LD_i32 : LD<Int32Regs>;
- defm LD_i64 : LD<Int64Regs>;
- defm LD_f16 : LD<Float16Regs>;
- defm LD_f16x2 : LD<Float16x2Regs>;
- defm LD_f32 : LD<Float32Regs>;
- defm LD_f64 : LD<Float64Regs>;
-}
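-
-// Illustration (hypothetical register names): LD_i32_ari with a .global
-// address-space code and an unsigned 32-bit width prints as:
-//   ld.global.u32  %r1, [%r2+8];
-// with the volatile/vector/sign fields supplied by the LdStCode operands.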
-
-multiclass ST<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr], $src;", []>;
- def _areg : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr], $src;", []>;
- def _areg_64 : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr], $src;", []>;
- def _ari : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
- def _ari_64 : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
- def _asi : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
-}
-
-let mayStore=1, hasSideEffects=0 in {
- defm ST_i8 : ST<Int16Regs>;
- defm ST_i16 : ST<Int16Regs>;
- defm ST_i32 : ST<Int32Regs>;
- defm ST_i64 : ST<Int64Regs>;
- defm ST_f16 : ST<Float16Regs>;
- defm ST_f16x2 : ST<Float16x2Regs>;
- defm ST_f32 : ST<Float32Regs>;
- defm ST_f64 : ST<Float64Regs>;
-}
-
-// The following instructions are used only during and after vector
-// elementization. Vector elementization happens at the machine-instruction
-// level, so these instructions never appear in the DAG.
-multiclass LD_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
- def _v2_areg : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
- def _v2_areg_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
- def _v2_ari : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
- def _v2_ari_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
- def _v2_asi : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
- def _v4_avar : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
- def _v4_areg : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
- def _v4_areg_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
- def _v4_ari : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
- def _v4_ari_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
- def _v4_asi : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
-}
-let mayLoad=1, hasSideEffects=0 in {
- defm LDV_i8 : LD_VEC<Int16Regs>;
- defm LDV_i16 : LD_VEC<Int16Regs>;
- defm LDV_i32 : LD_VEC<Int32Regs>;
- defm LDV_i64 : LD_VEC<Int64Regs>;
- defm LDV_f16 : LD_VEC<Float16Regs>;
- defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
- defm LDV_f32 : LD_VEC<Float32Regs>;
- defm LDV_f64 : LD_VEC<Float64Regs>;
-}
-
-multiclass ST_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
- def _v2_areg : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
- def _v2_areg_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
- def _v2_ari : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
- def _v2_ari_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
- def _v2_asi : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
- def _v4_avar : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_areg : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_areg_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_ari : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_ari_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_asi : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
-}
-
-let mayStore=1, hasSideEffects=0 in {
- defm STV_i8 : ST_VEC<Int16Regs>;
- defm STV_i16 : ST_VEC<Int16Regs>;
- defm STV_i32 : ST_VEC<Int32Regs>;
- defm STV_i64 : ST_VEC<Int64Regs>;
- defm STV_f16 : ST_VEC<Float16Regs>;
- defm STV_f16x2 : ST_VEC<Float16x2Regs>;
- defm STV_f32 : ST_VEC<Float32Regs>;
- defm STV_f64 : ST_VEC<Float64Regs>;
-}
-
-//---- Conversion ----
-
-class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
- NVPTXRegClass regclassOut> :
- NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
- !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")),
- [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
-
-def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
-def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
-def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
-def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
-def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
-def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
-def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
-def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
-
-// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
-// we cannot specify floating-point literals in isel patterns. Therefore, we
-// use an integer selp to select either 1 or 0 and then cvt to floating-point.
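In C++ terms the selp+cvt sequence behaves like selecting an integer 0 or 1 first and converting that integer afterwards; a minimal sketch (the helper name is illustrative, not part of the backend):

#include <cstdint>

// Mirrors (CVT_f32_u32 (SELP_u32ii 1, 0, p), CvtRN): pick 1 or 0 as an
// integer first, then convert that integer to floating point.
float predToF32(bool p) {
  uint32_t t = p ? 1u : 0u;     // SELP_u32ii 1, 0, $p
  return static_cast<float>(t); // CVT_f32_u32 ..., CvtRN
}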
-
-// sint -> f16
-def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
- (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
- (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
- (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
- (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
-
-// uint -> f16
-def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
- (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
- (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
- (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
- (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
-
-// sint -> f32
-def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
- (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
- (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
- (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
- (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
-
-// uint -> f32
-def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
- (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
- (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
- (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
- (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
-
-// sint -> f64
-def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
- (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
- (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
- (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
- (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
-
-// uint -> f64
-def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
- (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
- (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
- (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
- (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
-
-
-// f16 -> sint
-def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
- (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
- (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
- (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
- (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
- (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
- (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
- (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
-
-// f16 -> uint
-def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
- (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
- (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
- (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
- (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
- (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
- (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
- (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
-
-// f32 -> sint
-def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
- (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
- (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
- (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
- (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
- (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
- (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
-
-// f32 -> uint
-def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
- (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
- (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
- (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
- (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
- (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
- (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
-
-// f64 -> sint
-def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
- (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
- (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
- (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
-
-// f64 -> uint
-def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
- (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
- (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
- (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
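Every fp-to-int pattern above uses the RZI (round-to-zero) modifier, which matches the truncating semantics of a C++ integer cast; a small sketch:

#include <cassert>

int main() {
  // cvt.rzi truncates toward zero, just like a C++ integer cast.
  assert(static_cast<int>(2.9) == 2);
  assert(static_cast<int>(-2.9) == -2);
}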
-
-// sext i1
-def : Pat<(i16 (sext Int1Regs:$a)),
- (SELP_s16ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (sext Int1Regs:$a)),
- (SELP_s32ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (sext Int1Regs:$a)),
- (SELP_s64ii -1, 0, Int1Regs:$a)>;
-
-// zext i1
-def : Pat<(i16 (zext Int1Regs:$a)),
- (SELP_u16ii 1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (zext Int1Regs:$a)),
- (SELP_u32ii 1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (zext Int1Regs:$a)),
- (SELP_u64ii 1, 0, Int1Regs:$a)>;
-
-// anyext i1
-def : Pat<(i16 (anyext Int1Regs:$a)),
- (SELP_u16ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (anyext Int1Regs:$a)),
- (SELP_u32ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (anyext Int1Regs:$a)),
- (SELP_u64ii -1, 0, Int1Regs:$a)>;
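The three i1 extension flavors above differ only in which integer encodes true; a hedged C++ sketch of the selp encodings (function names are illustrative):

#include <cstdint>

int32_t sextI1(bool p) { return p ? -1 : 0; } // SELP_s32ii -1, 0, $a
int32_t zextI1(bool p) { return p ?  1 : 0; } // SELP_u32ii  1, 0, $a
// anyext is free to leave the upper bits unspecified; the patterns above
// happen to reuse the all-ones encoding.
int32_t anyextI1(bool p) { return p ? -1 : 0; } // SELP_u32ii -1, 0, $a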
-
-// sext i16
-def : Pat<(i32 (sext Int16Regs:$a)),
- (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (sext Int16Regs:$a)),
- (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
-
-// zext i16
-def : Pat<(i32 (zext Int16Regs:$a)),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (zext Int16Regs:$a)),
- (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
-
-// anyext i16
-def : Pat<(i32 (anyext Int16Regs:$a)),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (anyext Int16Regs:$a)),
- (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
-
-// sext i32
-def : Pat<(i64 (sext Int32Regs:$a)),
- (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
-
-// zext i32
-def : Pat<(i64 (zext Int32Regs:$a)),
- (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
-
-// anyext i32
-def : Pat<(i64 (anyext Int32Regs:$a)),
- (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
-
-
-// truncate i64
-def : Pat<(i32 (trunc Int64Regs:$a)),
- (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
-def : Pat<(i16 (trunc Int64Regs:$a)),
- (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
-def : Pat<(i1 (trunc Int64Regs:$a)),
- (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
-
-// truncate i32
-def : Pat<(i16 (trunc Int32Regs:$a)),
- (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
-def : Pat<(i1 (trunc Int32Regs:$a)),
- (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
-
-// truncate i16
-def : Pat<(i1 (trunc Int16Regs:$a)),
- (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
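All three trunc-to-i1 patterns keep only the low bit and compare it against 1; roughly, in C++ (the helper is illustrative):

#include <cstdint>

bool truncToI1(uint32_t x) {
  return (x & 1u) == 1u; // ANDb32ri $a, 1; SETP_b32ri ..., 1, CmpEQ
}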
-
-// sext_inreg
-def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
-def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
-def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
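sext_inreg sign-extends from a narrower field held in the same register, which the CVT_INREG forms implement with cvt.sN.sM; in C++ this is the usual double cast (helper names are illustrative):

#include <cstdint>

int32_t sextInReg8(int32_t x)  { return (int32_t)(int8_t)x;  } // CVT_INREG_s32_s8
int64_t sextInReg32(int64_t x) { return (int64_t)(int32_t)x; } // CVT_INREG_s64_s32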
-
-
-// Select instructions with 32-bit predicates
-def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
- (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
- (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
- (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
- (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
- (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
- (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
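Each of these patterns first normalizes the 32-bit "predicate" to a real i1 by testing its low bit, then selects; a hedged C++ analogue (names are illustrative):

#include <cstdint>

int32_t selectOnI32Pred(uint32_t pred, int32_t a, int32_t b) {
  bool p = (pred & 1u) == 1u; // SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ
  return p ? a : b;           // SELP_b32rr $a, $b, p
}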
-
-
-let hasSideEffects = 0 in {
- // pack a set of smaller int registers to a larger int register
- def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2,
- Int16Regs:$s3, Int16Regs:$s4),
- "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
- def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2),
- "mov.b32 \t$d, {{$s1, $s2}};", []>;
- def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int32Regs:$s1, Int32Regs:$s2),
- "mov.b64 \t$d, {{$s1, $s2}};", []>;
- def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
- (ins Float32Regs:$s1, Float32Regs:$s2),
- "mov.b64 \t$d, {{$s1, $s2}};", []>;
-
- // unpack a larger int register to a set of smaller int registers
- def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
- Int16Regs:$d3, Int16Regs:$d4),
- (ins Int64Regs:$s),
- "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
- def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
- (ins Int32Regs:$s),
- "mov.b32 \t{{$d1, $d2}}, $s;", []>;
- def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
- (ins Int64Regs:$s),
- "mov.b64 \t{{$d1, $d2}}, $s;", []>;
- def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
- (ins Float64Regs:$s),
- "mov.b64 \t{{$d1, $d2}}, $s;", []>;
-
-}
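These pack/unpack movs are plain bit concatenation, with the first listed element landing in the low bits; an equivalent C++ sketch (helper names are illustrative):

#include <cstdint>

uint64_t packV2I32(uint32_t s1, uint32_t s2) {                  // V2I32toI64
  return (static_cast<uint64_t>(s2) << 32) | s1;
}
void unpackI64toV2I32(uint64_t s, uint32_t &d1, uint32_t &d2) { // I64toV2I32
  d1 = static_cast<uint32_t>(s);       // low half
  d2 = static_cast<uint32_t>(s >> 32); // high half
}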
-
-let hasSideEffects = 0 in {
- // Extract an element of an f16x2 register. PTX does not provide any way
- // to access the elements of an f16x2 vector directly, so we extract them
- // using a temporary register.
- def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
- (ins Float16x2Regs:$src),
- "{{ .reg .b16 \t%tmp_hi;\n\t"
- " mov.b32 \t{$dst, %tmp_hi}, $src; }}",
- [(set Float16Regs:$dst,
- (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
- def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
- (ins Float16x2Regs:$src),
- "{{ .reg .b16 \t%tmp_lo;\n\t"
- " mov.b32 \t{%tmp_lo, $dst}, $src; }}",
- [(set Float16Regs:$dst,
- (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
-
- // Coalesce two f16 registers into f16x2
- def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
- (ins Float16Regs:$a, Float16Regs:$b),
- "mov.b32 \t$dst, {{$a, $b}};",
- [(set Float16x2Regs:$dst,
- (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
-
- // Directly initializing the underlying b32 register takes one less SASS
- // instruction than a vector-packing move.
- def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
- "mov.b32 \t$dst, $src;",
- []>;
-
- // Split f16x2 into two f16 registers.
- def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
- (ins Float16x2Regs:$src),
- "mov.b32 \t{{$lo, $hi}}, $src;",
- []>;
- // Split an i32 into two f16
- def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
- (ins Int32Regs:$src),
- "mov.b32 \t{{$lo, $hi}}, $src;",
- []>;
-}
-
-// Count leading zeros
-let hasSideEffects = 0 in {
- def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32 \t$d, $a;", []>;
- def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
-
- // The return type of the ctlz ISD node is the same as its input, but the PTX
- // clz instruction always returns a 32-bit value. For ctlz.i64, convert the
- // PTX value to 64 bits to match the ISD node's semantics, unless we know we're
- // truncating back down to 32 bits.
-def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
-
-// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
-// result back to 16-bits if necessary. We also need to subtract 16 because
-// the high-order 16 zeros were counted.
-//
-// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
-// use to save one SASS instruction (on sm_35 anyway):
-//
-// mov.b32 $tmp, {0xffff, $a}
-// ctlz.b32 $result, $tmp
-//
-// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
-// and then ctlz that value. This way we don't have to subtract 16 from the
-// result. Unfortunately today we don't have a way to generate
-// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(ctlz Int16Regs:$a),
- (SUBi16ri (CVT_u16_u32
- (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
- (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
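A C++ sketch of both tricks, assuming a nonzero argument since GCC/Clang's __builtin_clz is undefined at 0 (helper names are illustrative):

#include <cstdint>

// 64-bit: PTX clz.b64 yields a 32-bit count, so widen it to match ctlz.i64.
uint64_t clz64(uint64_t x) {
  return static_cast<uint64_t>(__builtin_clzll(x)); // CLZr64 + CVT_u64_u32
}

// 16-bit, as emitted today: zero-extend, count, subtract the 16 extra zeros.
uint32_t clz16(uint16_t x) {
  return __builtin_clz(static_cast<uint32_t>(x)) - 16;
}

// The "one-extend" alternative: fill the low half with ones so it can never
// contribute to the count, making the subtraction unnecessary.
uint32_t clz16_oneExtended(uint16_t x) {
  return __builtin_clz((static_cast<uint32_t>(x) << 16) | 0xffffu);
}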
-
-// Population count
-let hasSideEffects = 0 in {
- def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32 \t$d, $a;", []>;
- def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
-// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
-// pattern that avoids the type conversion if we're truncating the result to
-// i32 anyway.
-def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
-
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
-// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop Int16Regs:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
- (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
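The same widen-then-narrow shape for popcount, as a C++ sketch (the helper name is illustrative):

#include <cstdint>

uint16_t popcount16(uint16_t x) {
  // CVT_u32_u16, POPCr32, CVT_u16_u32; the count fits in 16 bits anyway.
  return static_cast<uint16_t>(__builtin_popcount(static_cast<uint32_t>(x)));
}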
-
-// fpround f32 -> f16
-def : Pat<(f16 (fpround Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f16 (fpround Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
-
-// fpround f64 -> f16
-def : Pat<(f16 (fpround Float64Regs:$a)),
- (CVT_f16_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f16 (fpround Float64Regs:$a)),
- (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
-
-// fpround f64 -> f32
-def : Pat<(f32 (fpround Float64Regs:$a)),
- (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpround Float64Regs:$a)),
- (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
-
-// fpextend f16 -> f32
-def : Pat<(f32 (fpextend Float16Regs:$a)),
- (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpextend Float16Regs:$a)),
- (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;
-
-// fpextend f16 -> f64
-def : Pat<(f64 (fpextend Float16Regs:$a)),
- (CVT_f64_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f64 (fpextend Float16Regs:$a)),
- (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;
-
-// fpextend f32 -> f64
-def : Pat<(f64 (fpextend Float32Regs:$a)),
- (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f64 (fpextend Float32Regs:$a)),
- (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
-
-def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-
-// fceil, ffloor, fround, ftrunc.
-
-def : Pat<(fceil Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(fceil Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fceil Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(fceil Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fceil Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
-
-def : Pat<(ffloor Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(ffloor Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(ffloor Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(ffloor Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(ffloor Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
-
-def : Pat<(fround Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f16 (fround Float16Regs:$a)),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fround Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fround Float32Regs:$a)),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(f64 (fround Float64Regs:$a)),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
-
-def : Pat<(ftrunc Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(ftrunc Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(ftrunc Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(ftrunc Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(ftrunc Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
-
-// nearbyint and rint are implemented as rounding to nearest even. This isn't
-// strictly correct, because it causes us to ignore the rounding mode. But it
-// matches what CUDA's "libm" does.
-
-def : Pat<(fnearbyint Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(fnearbyint Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fnearbyint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(fnearbyint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fnearbyint Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
-
-def : Pat<(frint Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(frint Float16Regs:$a),
- (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(frint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(frint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(frint Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
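Round-to-nearest-even resolves ties toward the even neighbor; under the default FP environment this is also what C++'s nearbyint does, e.g.:

#include <cassert>
#include <cmath>

int main() {
  // Ties go to the even integer, matching cvt.rni's behavior.
  assert(std::nearbyint(2.5) == 2.0);
  assert(std::nearbyint(3.5) == 4.0);
}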
-
-
-//-----------------------------------
-// Control-flow
-//-----------------------------------
-
-let isTerminator=1 in {
- let isReturn=1, isBarrier=1 in
- def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
-
- let isBranch=1 in
- def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@$a bra \t$target;",
- [(brcond Int1Regs:$a, bb:$target)]>;
- let isBranch=1 in
- def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@!$a bra \t$target;", []>;
-
- let isBranch=1, isBarrier=1 in
- def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
- "bra.uni \t$target;", [(br bb:$target)]>;
-}
-
-def : Pat<(brcond Int32Regs:$a, bb:$target),
- (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
-
- // SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
- // conditional branch if the target block is the next block, so that the code
- // can fall through to the target block. The inversion is done by 'xor
- // condition, 1', which will be translated to (setne condition, -1). Since PTX
- // supports '@!pred bra target', we should use it.
-def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
- (CBranchOther Int1Regs:$a, bb:$target)>;
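Since i1 constants are sign-extended, 'xor condition, 1' surfaces as setne(condition, -1); on a one-bit value both are just logical negation, e.g.:

#include <cassert>

int main() {
  for (int c = 0; c <= 1; ++c) {
    bool cond = c != 0;
    // xor with 1 flips the single bit, which is exactly !cond, i.e. the
    // sense of PTX's "@!$a bra $target".
    assert((cond ^ 1) == !cond);
  }
}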
-
-// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
- SDTCisVT<1, i32>]>;
-def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPSideEffect]>;
-
-def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
-def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def calltarget : Operand<i32>;
-let isCall=1 in {
- def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
-}
-
-def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
-def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
-
-// Pseudo instructions.
-class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : NVPTXInst<outs, ins, asmstr, pattern>;
-
-def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\\{ // callseq $amt1, $amt2\n"
- "\t.reg .b32 temp_param_reg;",
- [(callseq_start timm:$amt1, timm:$amt2)]>;
-def Callseq_End :
- NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\\} // callseq $amt1",
- [(callseq_end timm:$amt1, timm:$amt2)]>;
-
-// trap instruction
-def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
-
-// Call prototype wrapper
-def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def CallPrototype :
- SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def ProtoIdent : Operand<i32> {
- let PrintMethod = "printProtoIdent";
-}
-def CALL_PROTOTYPE :
- NVPTXInst<(outs), (ins ProtoIdent:$ident),
- "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
-
-
-include "NVPTXIntrinsics.td"
-
-
-//-----------------------------------
-// Notes
-//-----------------------------------
- // BSWAP is currently expanded. The following would be more efficient:
- // - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
- //   registers
- // - for sm_20, use prmt (with vector scalar mov to get the pack and
- //   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
- //   registers.
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+let hasSideEffects = 0 in {
+  def NOP : NVPTXInst<(outs), (ins), "", []>;
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+  def f16imm : Operand<f16>;
+}
+
+// List of vector specific properties
+def isVecLD      : VecInstTypeEnum<1>;
+def isVecST      : VecInstTypeEnum<2>;
+def isVecBuild   : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert  : VecInstTypeEnum<6>;
+def isVecDest    : VecInstTypeEnum<7>;
+def isVecOther   : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+// CVT conversion modes
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI  : PatLeaf<(i32 0x1)>;
+def CvtRZI  : PatLeaf<(i32 0x2)>;
+def CvtRMI  : PatLeaf<(i32 0x3)>;
+def CvtRPI  : PatLeaf<(i32 0x4)>;
+def CvtRN   : PatLeaf<(i32 0x5)>;
+def CvtRZ   : PatLeaf<(i32 0x6)>;
+def CvtRM   : PatLeaf<(i32 0x7)>;
+def CvtRP   : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
+
+def CvtSAT      : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+  let PrintMethod = "printCvtMode";
+}
+
+// Compare modes
+// These must match the enum in NVPTX.h
+def CmpEQ   : PatLeaf<(i32 0)>;
+def CmpNE   : PatLeaf<(i32 1)>;
+def CmpLT   : PatLeaf<(i32 2)>;
+def CmpLE   : PatLeaf<(i32 3)>;
+def CmpGT   : PatLeaf<(i32 4)>;
+def CmpGE   : PatLeaf<(i32 5)>;
+def CmpEQU  : PatLeaf<(i32 10)>;
+def CmpNEU  : PatLeaf<(i32 11)>;
+def CmpLTU  : PatLeaf<(i32 12)>;
+def CmpLEU  : PatLeaf<(i32 13)>;
+def CmpGTU  : PatLeaf<(i32 14)>;
+def CmpGEU  : PatLeaf<(i32 15)>;
+def CmpNUM  : PatLeaf<(i32 16)>;
+def CmpNAN  : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ  : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ  : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ  : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ  : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ  : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ  : PatLeaf<(i32 0x105)>;
+def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+  let PrintMethod = "printCmpMode";
+}
+def VecElement : Operand<i32> {
+  let PrintMethod = "printVecElement";
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+  Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+  Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
+def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
+def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
+def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
+def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+def hasVote : Predicate<"Subtarget->hasVote()">;
+def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
+def hasLDG : Predicate<"Subtarget->hasLDG()">;
+def hasLDU : Predicate<"Subtarget->hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+
+def doMulWide : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
+def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
+
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+
+def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
+
+def true : Predicate<"true">;
+
+def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
+def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
+
+def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
+def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
+
+def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Template for instructions which take three int64, int32, or int16 args.
+// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
+multiclass I3<string OpcStr, SDNode OpNode> {
+  def i64rr :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+  def i64ri :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+  def i32ri :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+  def i16ri :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+// Template for instructions which take 3 int32 args. The instructions are
+// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+  def i32rr :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+  def i32ri :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+              !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+//
+// This multiclass should be used for nodes that cannot be folded into FMAs.
+// For nodes that can be folded into FMAs (i.e. adds and muls), use
+// F3_fma_component.
+multiclass F3<string OpcStr, SDNode OpNode> {
+  def f64rr :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, Float64Regs:$b),
+              !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+  def f64ri :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, f64imm:$b),
+              !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+  def f32rr_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+    Requires<[doF32FTZ]>;
+  def f32ri_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+    Requires<[doF32FTZ]>;
+  def f32rr :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+  def f32ri :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+// Template for instructions which take three FP args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32/fp16 functions.
+//
+// This multiclass should be used for nodes that can be folded to make fma ops.
+// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
+// just like the non ".rn" op, but prevents ptxas from creating FMAs.
+multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
+  def f64rr :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, Float64Regs:$b),
+              !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+    Requires<[allowFMA]>;
+  def f64ri :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, f64imm:$b),
+              !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+    Requires<[allowFMA]>;
+  def f32rr_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+    Requires<[allowFMA, doF32FTZ]>;
+  def f32ri_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+    Requires<[allowFMA, doF32FTZ]>;
+  def f32rr :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+    Requires<[allowFMA]>;
+  def f32ri :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+    Requires<[allowFMA]>;
+
+  def f16rr_ftz :
+    NVPTXInst<(outs Float16Regs:$dst),
+              (ins Float16Regs:$a, Float16Regs:$b),
+              !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
+              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+  def f16rr :
+    NVPTXInst<(outs Float16Regs:$dst),
+              (ins Float16Regs:$a, Float16Regs:$b),
+              !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
+              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+    Requires<[useFP16Math, allowFMA]>;
+
+  def f16x2rr_ftz :
+    NVPTXInst<(outs Float16x2Regs:$dst),
+              (ins Float16x2Regs:$a, Float16x2Regs:$b),
+              !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+  def f16x2rr :
+    NVPTXInst<(outs Float16x2Regs:$dst),
+              (ins Float16x2Regs:$a, Float16x2Regs:$b),
+              !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+    Requires<[useFP16Math, allowFMA]>;
+
+  // These have strange names so we don't perturb existing mir tests.
+  def _rnf64rr :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, Float64Regs:$b),
+              !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+    Requires<[noFMA]>;
+  def _rnf64ri :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, f64imm:$b),
+              !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+    Requires<[noFMA]>;
+  def _rnf32rr_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+    Requires<[noFMA, doF32FTZ]>;
+  def _rnf32ri_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+    Requires<[noFMA, doF32FTZ]>;
+  def _rnf32rr :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+    Requires<[noFMA]>;
+  def _rnf32ri :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+    Requires<[noFMA]>;
+  def _rnf16rr_ftz :
+    NVPTXInst<(outs Float16Regs:$dst),
+              (ins Float16Regs:$a, Float16Regs:$b),
+              !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
+              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+    Requires<[useFP16Math, noFMA, doF32FTZ]>;
+  def _rnf16rr :
+    NVPTXInst<(outs Float16Regs:$dst),
+              (ins Float16Regs:$a, Float16Regs:$b),
+              !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
+              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+    Requires<[useFP16Math, noFMA]>;
+  def _rnf16x2rr_ftz :
+    NVPTXInst<(outs Float16x2Regs:$dst),
+              (ins Float16x2Regs:$a, Float16x2Regs:$b),
+              !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
+              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+    Requires<[useFP16Math, noFMA, doF32FTZ]>;
+  def _rnf16x2rr :
+    NVPTXInst<(outs Float16x2Regs:$dst),
+              (ins Float16x2Regs:$a, Float16x2Regs:$b),
+              !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
+              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+    Requires<[useFP16Math, noFMA]>;
+}
+
+// Template for operations which take two f32 or f64 operands. Provides three
+// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+// subnormal inputs and results to zero).
+multiclass F2<string OpcStr, SDNode OpNode> {
+  def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+                      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+  def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                          !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+                          [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+                Requires<[doF32FTZ]>;
+  def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Type Conversion
+//-----------------------------------
+
+let hasSideEffects = 0 in {
+  // Generate a cvt to the given type from all possible types. Each instance
+  // takes a CvtMode immediate that defines the conversion mode to use. It can
+  // be CvtNONE to omit a conversion mode.
+  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+    def _s8 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".s8 \t$dst, $src;"), []>;
+    def _u8 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".u8 \t$dst, $src;"), []>;
+    def _s16 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".s16 \t$dst, $src;"), []>;
+    def _u16 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".u16 \t$dst, $src;"), []>;
+    def _s32 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int32Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".s32 \t$dst, $src;"), []>;
+    def _u32 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int32Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".u32 \t$dst, $src;"), []>;
+    def _s64 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int64Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".s64 \t$dst, $src;"), []>;
+    def _u64 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int64Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".u64 \t$dst, $src;"), []>;
+    def _f16 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Float16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".f16 \t$dst, $src;"), []>;
+    def _f32 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Float32Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".f32 \t$dst, $src;"), []>;
+    def _f64 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Float64Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                           FromName, ".f64 \t$dst, $src;"), []>;
+  }
+
+  // Generate cvts from all types to all types.
+  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
+  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
+  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
+  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+  // These cvts are different from those above: The source and dest registers
+  // are of the same type.
+  def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                                   "cvt.s16.s8 \t$dst, $src;", []>;
+  def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                                   "cvt.s32.s8 \t$dst, $src;", []>;
+  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                                    "cvt.s32.s16 \t$dst, $src;", []>;
+  def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                   "cvt.s64.s8 \t$dst, $src;", []>;
+  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                    "cvt.s64.s16 \t$dst, $src;", []>;
+  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                    "cvt.s64.s32 \t$dst, $src;", []>;
+}
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+// Template for xor masquerading as int1 arithmetic.
+multiclass ADD_SUB_i1<SDNode OpNode> {
+  def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+                     "xor.pred \t$dst, $a, $b;",
+                     [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+  def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+                     "xor.pred \t$dst, $a, $b;",
+                     [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+// int1 addition and subtraction are both just xor.
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
+// also use these for unsigned arithmetic.
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+// int32 addition and subtraction with carry-out.
+// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+// int32 addition and subtraction with carry-in and carry-out.
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
+// will lower it.
+defm SREM : I3<"rem.s", srem>;
+defm UREM : I3<"rem.u", urem>;
+
+// Integer absolute value. NumBits should be one minus the bit width of RC.
+// This idiom implements the algorithm at
+// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs.
+multiclass ABS<RegisterClass RC, string SizeName> {
+  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
+                  !strconcat("abs", SizeName, " \t$dst, $a;"),
+                  [(set RC:$dst, (abs RC:$a))]>;
+}
+defm ABS_16 : ABS<Int16Regs, ".s16">;
+defm ABS_32 : ABS<Int32Regs, ".s32">;
+defm ABS_64 : ABS<Int64Regs, ".s64">;
+
+// Integer min/max.
+defm SMAX : I3<"max.s", smax>;
+defm UMAX : I3<"max.u", umax>;
+defm SMIN : I3<"min.s", smin>;
+defm UMIN : I3<"min.u", umin>;
+
+//
+// Wide multiplication
+//
+def MULWIDES64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+            "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+            "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+            "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+            "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+            "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+            "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+            "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+            "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+            "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+            "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+            "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+            "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+// Matchers for signed, unsigned mul.wide ISD nodes.
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+
+// Predicates used for converting some patterns to mul.wide.
+def SInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isSignedIntN(32);
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isIntN(32);
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isSignedIntN(16);
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isIntN(16);
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+  // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
+  const APInt &v = N->getAPIntValue();
+  return v.sge(0) && v.slt(32);
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+  // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
+  const APInt &v = N->getAPIntValue();
+  return v.sge(0) && v.slt(16);
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(32, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(16, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
+}]>;
+
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+      Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+      Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+      Requires<[doMulWide]>;
+
+// Convert "sign/zero-extend then multiply" to mul.wide.
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+      Requires<[doMulWide]>;
+
+//
+// Integer multiply-add
+//
+def SDTIMAD :
+  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+            "neg.s16 \t$dst, $src;",
+            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+            "neg.s32 \t$dst, $src;",
+            [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+            "neg.s64 \t$dst, $src;",
+            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
+         N->getValueAPF().convertToFloat() == 1.0f;
+}]>;
+// Constant 1.0 (double)
+def DoubleConst1 : PatLeaf<(fpimm), [{
+  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
+         N->getValueAPF().convertToDouble() == 1.0;
+}]>;
+
+// Loads FP16 constant into a register.
+//
+// ptxas does not have hex representation for fp16, so we can't use
+// fp16 immediate values in .f16 instructions. Instead we have to load
+// the constant into a register using mov.b16.
+def LOAD_CONST_F16 :
+  NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
+            "mov.b16 \t$dst, $a;", []>;
+
+defm FADD : F3_fma_component<"add", fadd>;
+defm FSUB : F3_fma_component<"sub", fsub>;
+defm FMUL : F3_fma_component<"mul", fmul>;
+
+defm FMIN : F3<"min", fminnum>;
+defm FMAX : F3<"max", fmaxnum>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r :
+  NVPTXInst<(outs Float64Regs:$dst),
+            (ins f64imm:$a, Float64Regs:$b),
+            "rcp.rn.f64 \t$dst, $b;",
+            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr :
+  NVPTXInst<(outs Float64Regs:$dst),
+            (ins Float64Regs:$a, Float64Regs:$b),
+            "div.rn.f64 \t$dst, $a, $b;",
+            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri :
+  NVPTXInst<(outs Float64Regs:$dst),
+            (ins Float64Regs:$a, f64imm:$b),
+            "div.rn.f64 \t$dst, $a, $b;",
+            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.approx.ftz.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+  Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.approx.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+  Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.approx.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+  Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.approx.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+  Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.approx.f32 \t$dst, $a, $b;",
+            [(set
Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX]>; +def FDIV32approxri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX]>; +// +// F32 Semi-accurate reciprocal +// +// rcp.approx gives the same result as div.full(1.0f, a) and is faster. +// +def FDIV321r_approx_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV321r_approx : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL]>; +// +// F32 Semi-accurate division +// +def FDIV32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.full.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.full.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.full.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL]>; +def FDIV32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.full.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_FULL]>; +// +// F32 Accurate reciprocal +// +def FDIV321r_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.rn.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[reqPTX20, doF32FTZ]>; +def FDIV321r_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.rn.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[reqPTX20]>; +// +// F32 Accurate division +// +def FDIV32rr_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.rn.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ, reqPTX20]>; +def FDIV32ri_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.rn.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ, reqPTX20]>; +def FDIV32rr_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.rn.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[reqPTX20]>; +def FDIV32ri_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.rn.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[reqPTX20]>; + +// +// FMA +// + +multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> { + def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, + Requires<[Pred]>; + 
  def rri : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, RC:$b, ImmCls:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+  def rir : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, ImmCls:$b, RC:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+  def rii : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+}
+
+multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
+  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+                      Requires<[useFP16Math, Pred]>;
+}
+
+defm FMA16_ftz   : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
+defm FMA16       : FMA_F16<"fma.rn.f16", Float16Regs, true>;
+defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
+defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
+defm FMA32_ftz   : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+defm FMA32       : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+defm FMA64       : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+
+// sin/cos
+def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                    "sin.approx.f32 \t$dst, $src;",
+                    [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
+                    Requires<[allowUnsafeFPMath]>;
+def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                    "cos.approx.f32 \t$dst, $src;",
+                    [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
+                    Requires<[allowUnsafeFPMath]>;
+
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
+// i.e. "poor man's fmod()".
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+            Float32Regs:$y))>,
+      Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+            fpimm:$y))>,
+      Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+            Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+            fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+            Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+            fpimm:$y))>;
+
+//-----------------------------------
+// Bitwise operations
+//-----------------------------------
+
+// Template for three-operand bitwise operations. Creates .b16, .b32, .b64,
+// and .pred (predicate registers -- i.e., i1) versions of OpcStr.
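+//
+// For example, BITWISE<"or", or> (defm OR below) creates ORb1rr/ORb1ri
+// (or.pred), ORb16rr/ORb16ri (or.b16), ORb32rr/ORb32ri (or.b32), and
+// ORb64rr/ORb64ri (or.b64).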
+multiclass BITWISE<string OpcStr, SDNode OpNode> { + def b1rr : + NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def b1ri : + NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; + def b16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def b16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def b32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def b32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def b64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def b64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; +} + +defm OR : BITWISE<"or", or>; +defm AND : BITWISE<"and", and>; +defm XOR : BITWISE<"xor", xor>; + +def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), + "not.pred \t$dst, $src;", + [(set Int1Regs:$dst, (not Int1Regs:$src))]>; +def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "not.b16 \t$dst, $src;", + [(set Int16Regs:$dst, (not Int16Regs:$src))]>; +def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "not.b32 \t$dst, $src;", + [(set Int32Regs:$dst, (not Int32Regs:$src))]>; +def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "not.b64 \t$dst, $src;", + [(set Int64Regs:$dst, (not Int64Regs:$src))]>; + +// Template for left/right shifts. Takes three operands, +// [dest (reg), src (reg), shift (reg or imm)]. +// dest and src may be int64, int32, or int16, but shift is always int32. +// +// This template also defines a 32-bit shift (imm, imm) instruction. 
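+//
+// For example, SHIFT<"shl.b", shl> (defm SHL below) yields SHLi64ri, which
+// prints as
+//   shl.b64 %dst, %src, 5;
+// for a 64-bit left shift by the immediate 5.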
+multiclass SHIFT<string OpcStr, SDNode OpNode> { + def i64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>; + def i64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>; + def i32ii : + NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; + def i16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>; + def i16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; +} + +defm SHL : SHIFT<"shl.b", shl>; +defm SRA : SHIFT<"shr.s", sra>; +defm SRL : SHIFT<"shr.u", srl>; + +// Bit-reverse +def BREV32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), + "brev.b32 \t$dst, $a;", + [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>; +def BREV64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a), + "brev.b64 \t$dst, $a;", + [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; + +// +// Rotate: Use ptx shf instruction if available. +// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTL32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 
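+// 32 bit r2 = rotl r1, 5
+// =>
+//   shl.b32 lhs, r1, 5;  shr.b32 rhs, r1, 27;  add.u32 r2, lhs, rhs;
+// (add.u32 acts as an OR here because the two shifted halves have no bits in
+// common.)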
+def ROT32imm_sw :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            "shl.b32 \t%lhs, $src, $amt1;\n\t"
+            "shr.b32 \t%rhs, $src, $amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            ".reg .b32 %amt2;\n\t"
+            "shl.b32 \t%lhs, $src, $amt;\n\t"
+            "sub.s32 \t%amt2, 32, $amt;\n\t"
+            "shr.b32 \t%rhs, $src, %amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+      Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            ".reg .b32 %amt2;\n\t"
+            "shr.b32 \t%lhs, $src, $amt;\n\t"
+            "sub.s32 \t%amt2, 32, $amt;\n\t"
+            "shl.b32 \t%rhs, $src, %amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+      Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            "shl.b64 \t%lhs, $src, $amt1;\n\t"
+            "shr.b64 \t%rhs, $src, $amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(64 - N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            ".reg .u32 %amt2;\n\t"
+            "shl.b64 \t%lhs, $src, $amt;\n\t"
+            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "shr.b64 \t%rhs, $src, %amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+// 64-bit software rotate right by register.
+def ROTR64reg_sw :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            ".reg .u32 %amt2;\n\t"
+            "shr.b64 \t%lhs, $src, $amt;\n\t"
+            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "shl.b64 \t%rhs, $src, %amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +def SDTIntShiftDOp : + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; + +def FUNSHFRCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; + +// +// BFE - bit-field extract +// + +// Template for BFE instructions. Takes four args, +// [dest (reg), src (reg), start (reg or imm), end (reg or imm)]. +// Start may be an imm only if end is also an imm. FIXME: Is this a +// restriction in PTX? +// +// dest and src may be int32 or int64, but start and end are always int32. +multiclass BFE<string TyStr, RegisterClass RC> { + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; +} + +let hasSideEffects = 0 in { + defm BFE_S32 : BFE<"s32", Int32Regs>; + defm BFE_U32 : BFE<"u32", Int32Regs>; + defm BFE_S64 : BFE<"s64", Int64Regs>; + defm BFE_U64 : BFE<"u64", Int64Regs>; +} + +//----------------------------------- +// Comparison instructions (setp, set) +//----------------------------------- + +// FIXME: This doesn't cover versions of set and setp that combine with a +// boolean predicate, e.g. setp.eq.and.b16. 
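+//
+// A setp instruction compares two values and writes a predicate register,
+// e.g.
+//   setp.eq.s32 %p, %a, %b;
+// In the multiclasses below, the CmpMode operand supplies the comparison
+// (${cmp:base}) and the optional flush-to-zero suffix (${cmp:ftz}).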
+ +let hasSideEffects = 0 in { + multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> { + def rr : + NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + " \t$dst, $a, $b;"), []>; + def ri : + NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + " \t$dst, $a, $b;"), []>; + def ir : + NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + " \t$dst, $a, $b;"), []>; + } +} + +defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>; +defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>; +defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>; +defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>; +defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>; +defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>; +defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>; +defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>; +defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>; +defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>; +defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; +def SETP_f16rr : + NVPTXInst<(outs Int1Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp), + "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;", + []>, Requires<[useFP16Math]>; + +def SETP_f16x2rr : + NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), + (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp), + "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;", + []>, + Requires<[useFP16Math]>; + + +// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form +// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination +// reg, either u32, s32, or f32. Anyway these aren't used at the moment. + +let hasSideEffects = 0 in { + multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> { + def rr : NVPTXInst<(outs Int32Regs:$dst), + (ins RC:$a, RC:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; + def ri : NVPTXInst<(outs Int32Regs:$dst), + (ins RC:$a, ImmCls:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; + def ir : NVPTXInst<(outs Int32Regs:$dst), + (ins ImmCls:$a, RC:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; + } +} + +defm SET_b16 : SET<"b16", Int16Regs, i16imm>; +defm SET_s16 : SET<"s16", Int16Regs, i16imm>; +defm SET_u16 : SET<"u16", Int16Regs, i16imm>; +defm SET_b32 : SET<"b32", Int32Regs, i32imm>; +defm SET_s32 : SET<"s32", Int32Regs, i32imm>; +defm SET_u32 : SET<"u32", Int32Regs, i32imm>; +defm SET_b64 : SET<"b64", Int64Regs, i64imm>; +defm SET_s64 : SET<"s64", Int64Regs, i64imm>; +defm SET_u64 : SET<"u64", Int64Regs, i64imm>; +defm SET_f16 : SET<"f16", Float16Regs, f16imm>; +defm SET_f32 : SET<"f32", Float32Regs, f32imm>; +defm SET_f64 : SET<"f64", Float64Regs, f64imm>; + +//----------------------------------- +// Selection instructions (selp) +//----------------------------------- + +// FIXME: Missing slct + +// selp instructions that don't have any pattern matches; we explicitly use +// them within this file. 
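+//
+// selp writes $a to $dst if the predicate $p is true, and $b otherwise:
+//   selp.b32 %dst, %a, %b, %p;
+// which is exactly (select i1 $p, $a, $b), matched by SELP_PATTERN below.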
+let hasSideEffects = 0 in { + multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { + def rr : NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + def ri : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + def ir : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + def ii : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + } + + multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls, + SDNode ImmNode> { + def rr : + NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>; + def ri : + NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>; + def ir : + NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>; + def ii : + NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; + } +} + +// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as +// good. +defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>; +defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>; +defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>; +defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>; +defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>; +defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>; +defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>; +defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>; +defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>; +defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>; +defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>; +defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>; + +def SELP_f16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Float16x2Regs:$dst, + (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>; + +//----------------------------------- +// Data Movement (Load / Store, Move) +//----------------------------------- + +def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], + [SDNPWantRoot]>; +def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], + [SDNPWantRoot]>; + +def MEMri : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int32Regs, i32imm); +} +def MEMri64 : Operand<i64> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int64Regs, i64imm); +} + +def imem : Operand<iPTR> { + let PrintMethod = "printOperand"; +} + +def imemAny : Operand<iPTRAny> { + let PrintMethod = "printOperand"; +} + +def LdStCode : Operand<i32> { + let PrintMethod = "printLdStCode"; +} + +def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; + +// Load a memory address into a u32 or u64 register. 
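+// For example, taking the address of a global g:
+//   mov.u64 %rd, g;
+// (mov.u32 when pointers are 32-bit), matching (Wrapper tglobaladdr:$a) in
+// the patterns below.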
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), + "mov.u32 \t$dst, $a;", + [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; +def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), + "mov.u64 \t$dst, $a;", + [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; + +// Get pointer to local stack. +let hasSideEffects = 0 in { + def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), + "mov.u32 \t$d, __local_depot$num;", []>; + def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), + "mov.u64 \t$d, __local_depot$num;", []>; +} + + +// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp +let IsSimpleMove=1, hasSideEffects=0 in { + def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), + "mov.pred \t$dst, $sss;", []>; + def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), + "mov.u16 \t$dst, $sss;", []>; + def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), + "mov.u32 \t$dst, $sss;", []>; + def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), + "mov.u64 \t$dst, $sss;", []>; + + def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src), + // We have to use .b16 here as there's no mov.f16. + "mov.b16 \t$dst, $src;", []>; + def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "mov.f32 \t$dst, $src;", []>; + def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), + "mov.f64 \t$dst, $src;", []>; +} + +def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), + "mov.pred \t$dst, $src;", + [(set Int1Regs:$dst, imm:$src)]>; +def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), + "mov.u16 \t$dst, $src;", + [(set Int16Regs:$dst, imm:$src)]>; +def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), + "mov.u32 \t$dst, $src;", + [(set Int32Regs:$dst, imm:$src)]>; +def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), + "mov.u64 \t$dst, $src;", + [(set Int64Regs:$dst, imm:$src)]>; + +def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), + "mov.f32 \t$dst, $src;", + [(set Float32Regs:$dst, fpimm:$src)]>; +def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), + "mov.f64 \t$dst, $src;", + [(set Float64Regs:$dst, fpimm:$src)]>; + +def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; + +//---- Copy Frame Index ---- +def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), + "add.u32 \t$dst, ${addr:add};", + [(set Int32Regs:$dst, ADDRri:$addr)]>; +def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), + "add.u64 \t$dst, ${addr:add};", + [(set Int64Regs:$dst, ADDRri64:$addr)]>; + +//----------------------------------- +// Comparison and Selection +//----------------------------------- + +multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode, + Instruction setp_16rr, + Instruction setp_16ri, + Instruction setp_16ir, + Instruction setp_32rr, + Instruction setp_32ri, + Instruction setp_32ir, + Instruction setp_64rr, + Instruction setp_64ri, + Instruction setp_64ir, + Instruction set_16rr, + Instruction set_16ri, + Instruction set_16ir, + Instruction set_32rr, + Instruction set_32ri, + Instruction set_32ir, + Instruction set_64rr, + Instruction set_64ri, + Instruction set_64ir> { + // i16 -> pred + def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)), + (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), + (setp_16ri Int16Regs:$a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, 
Int16Regs:$b)), + (setp_16ir imm:$a, Int16Regs:$b, Mode)>; + // i32 -> pred + def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)), + (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)), + (setp_32ri Int32Regs:$a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)), + (setp_32ir imm:$a, Int32Regs:$b, Mode)>; + // i64 -> pred + def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), + (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), + (setp_64ri Int64Regs:$a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), + (setp_64ir imm:$a, Int64Regs:$b, Mode)>; + + // i16 -> i32 + def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)), + (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)), + (set_16ri Int16Regs:$a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), + (set_16ir imm:$a, Int16Regs:$b, Mode)>; + // i32 -> i32 + def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)), + (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)), + (set_32ri Int32Regs:$a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)), + (set_32ir imm:$a, Int32Regs:$b, Mode)>; + // i64 -> i32 + def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), + (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), + (set_64ri Int64Regs:$a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), + (set_64ir imm:$a, Int64Regs:$b, Mode)>; +} + +multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> + : ISET_FORMAT<OpNode, Mode, + SETP_s16rr, SETP_s16ri, SETP_s16ir, + SETP_s32rr, SETP_s32ri, SETP_s32ir, + SETP_s64rr, SETP_s64ri, SETP_s64ir, + SET_s16rr, SET_s16ri, SET_s16ir, + SET_s32rr, SET_s32ri, SET_s32ir, + SET_s64rr, SET_s64ri, SET_s64ir> { + // TableGen doesn't like empty multiclasses. + def : PatLeaf<(i32 0)>; +} + +multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> + : ISET_FORMAT<OpNode, Mode, + SETP_u16rr, SETP_u16ri, SETP_u16ir, + SETP_u32rr, SETP_u32ri, SETP_u32ir, + SETP_u64rr, SETP_u64ri, SETP_u64ir, + SET_u16rr, SET_u16ri, SET_u16ir, + SET_u32rr, SET_u32ri, SET_u32ir, + SET_u64rr, SET_u64ri, SET_u64ir> { + // TableGen doesn't like empty multiclasses. 
+  def : PatLeaf<(i32 0)>;
+}
+
+defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
+defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+
+// i1 compares
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+  // f16 -> pred
+  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+        Requires<[useFP16Math,doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+        Requires<[useFP16Math]>;
+  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+        Requires<[useFP16Math,doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+        Requires<[useFP16Math]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+        Requires<[useFP16Math,doF32FTZ]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+        Requires<[useFP16Math]>;
+
+  // f32 -> pred
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> pred
+  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+  // f16 -> i32
+  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+        Requires<[useFP16Math, doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+        Requires<[useFP16Math]>;
+  def : Pat<(i32 (OpNode
Float16Regs:$a, fpimm:$b)), + (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, + Requires<[useFP16Math, doF32FTZ]>; + def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)), + (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, + Requires<[useFP16Math]>; + def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)), + (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>, + Requires<[useFP16Math, doF32FTZ]>; + def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)), + (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>, + Requires<[useFP16Math]>; + + // f32 -> i32 + def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), + (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, + Requires<[doF32FTZ]>; + def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), + (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), + (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, + Requires<[doF32FTZ]>; + def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), + (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>; + def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), + (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, + Requires<[doF32FTZ]>; + def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), + (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>; + + // f64 -> i32 + def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)), + (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)), + (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>; + def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)), + (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>; +} + +defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>; +defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>; +defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>; +defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>; +defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>; +defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>; + +defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>; +defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>; +defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>; +defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>; +defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>; +defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>; + +defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>; +defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>; +defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>; +defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>; +defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>; +defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>; + +defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>; +defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>; + +// FIXME: What is this doing here? Can it be deleted? 
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, +// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDTDeclareParamProfile : + SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; +def SDTDeclareScalarParamProfile : + SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; +def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; +def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; +def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; +def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; +def SDTCallValProfile : SDTypeProfile<1, 0, []>; +def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; +def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; +def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; +def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; + +def DeclareParam : + SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareScalarParam : + SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRetParam : + SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRet : + SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LoadParam : + SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def LoadParamV2 : + SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def LoadParamV4 : + SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def PrintCall : + SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintConvergentCall : + SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintCallUni : + SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintConvergentCallUni : + SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParam : + SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamV2 : + SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamV4 : + SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamU32 : + 
SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamS32 : + SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgBegin : + SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArg : + SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LastCallArg : + SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgEnd : + SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVoid : + SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def Prototype : + SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVal : + SDNode<"NVPTXISD::CallVal", SDTCallValProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveParam : + SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; +def StoreRetval : + SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetvalV2 : + SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetvalV4 : + SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, + [SDNPHasChain, SDNPSideEffect]>; +def PseudoUseParam : + SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def RETURNNode : + SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPSideEffect]>; + +let mayLoad = 1 in { + class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"), + []>; + + class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), + !strconcat("ld.param.v2", opstr, + " \t{{$dst, $dst2}}, [retval0+$b];"), []>; + + class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins i32imm:$b), + !strconcat("ld.param.v4", opstr, + " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), + []>; +} + +class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat("mov", opstr, " \t$dst, retval$b;"), + [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; + +let mayStore = 1 in { + class StoreParamInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat("st.param", opstr, " \t[param$a+$b], $val;"), + []>; + + class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, + i32imm:$a, i32imm:$b), + !strconcat("st.param.v2", opstr, + " \t[param$a+$b], {{$val, $val2}};"), + []>; + + class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3, + regclass:$val4, i32imm:$a, + i32imm:$b), + !strconcat("st.param.v4", opstr, + " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"), + []>; + + class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : + 
NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), + !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"), + []>; + + class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), + !strconcat("st.param.v2", opstr, + " \t[func_retval0+$a], {{$val, $val2}};"), + []>; + + class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$val, regclass:$val2, regclass:$val3, + regclass:$val4, i32imm:$a), + !strconcat("st.param.v4", opstr, + " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), + []>; +} + +let isCall=1 in { + multiclass CALL<string OpcStr, SDNode OpNode> { + def PrintCallNoRetInst : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; + def PrintCallRetInst1 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; + def PrintCallRetInst2 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; + def PrintCallRetInst3 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; + def PrintCallRetInst4 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), + [(OpNode (i32 4))]>; + def PrintCallRetInst5 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), + [(OpNode (i32 5))]>; + def PrintCallRetInst6 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5), "), + [(OpNode (i32 6))]>; + def PrintCallRetInst7 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5, retval6), "), + [(OpNode (i32 7))]>; + def PrintCallRetInst8 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5, retval6, retval7), "), + [(OpNode (i32 8))]>; + } +} + +defm Call : CALL<"call", PrintCall>; +defm CallUni : CALL<"call.uni", PrintCallUni>; + +// Convergent call instructions. These are identical to regular calls, except +// they have the isConvergent bit set. 
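+// (Convergent instructions may not be made control-dependent on any
+// additional values; this keeps the optimizer from breaking calls that
+// communicate between threads, such as barriers.)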
+let isConvergent=1 in { + defm ConvergentCall : CALL<"call", PrintConvergentCall>; + defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; +} + +def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; +def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; +def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; +def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">; +def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">; +def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">; +def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">; +def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">; +def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">; +def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">; +def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">; +def LoadParamMemF16 : LoadParamMemInst<Float16Regs, ".b16">; +def LoadParamMemF16x2 : LoadParamMemInst<Float16x2Regs, ".b32">; +def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; +def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; +def LoadParamMemV2F16 : LoadParamV2MemInst<Float16Regs, ".b16">; +def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">; +def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">; +def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">; +def LoadParamMemV4F16 : LoadParamV4MemInst<Float16Regs, ".b16">; +def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">; +def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">; + +def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">; +def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; + +def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">; +def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">; +def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; +def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; +def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; +def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">; + +def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">; +def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">; +def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">; + +def StoreParamF16 : StoreParamInst<Float16Regs, ".b16">; +def StoreParamF16x2 : StoreParamInst<Float16x2Regs, ".b32">; +def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; +def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; +def StoreParamV2F16 : StoreParamV2Inst<Float16Regs, ".b16">; +def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">; +def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">; +def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">; +def StoreParamV4F16 : StoreParamV4Inst<Float16Regs, ".b16">; +def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">; +def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">; + +def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; +def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; +def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; +def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; +def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; +def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; +def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; +def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">; +def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">; +def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">; +def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">; + +def 
StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; +def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; +def StoreRetvalF16 : StoreRetvalInst<Float16Regs, ".b16">; +def StoreRetvalF16x2 : StoreRetvalInst<Float16x2Regs, ".b32">; +def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">; +def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">; +def StoreRetvalV2F16 : StoreRetvalV2Inst<Float16Regs, ".b16">; +def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">; +def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">; +def StoreRetvalV4F16 : StoreRetvalV4Inst<Float16Regs, ".b16">; +def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">; + +def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; +def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; +def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; +def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; + +class CallArgInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$a), "$a, ", + [(CallArg (i32 0), regclass:$a)]>; + +class LastCallArgInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$a), "$a", + [(LastCallArg (i32 0), regclass:$a)]>; + +def CallArgI64 : CallArgInst<Int64Regs>; +def CallArgI32 : CallArgInst<Int32Regs>; +def CallArgI16 : CallArgInst<Int16Regs>; +def CallArgF64 : CallArgInst<Float64Regs>; +def CallArgF32 : CallArgInst<Float32Regs>; + +def LastCallArgI64 : LastCallArgInst<Int64Regs>; +def LastCallArgI32 : LastCallArgInst<Int32Regs>; +def LastCallArgI16 : LastCallArgInst<Int16Regs>; +def LastCallArgF64 : LastCallArgInst<Float64Regs>; +def LastCallArgF32 : LastCallArgInst<Float32Regs>; + +def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", + [(CallArg (i32 0), (i32 imm:$a))]>; +def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", + [(LastCallArg (i32 0), (i32 imm:$a))]>; + +def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", + [(CallArg (i32 1), (i32 imm:$a))]>; +def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", + [(LastCallArg (i32 1), (i32 imm:$a))]>; + +def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", + [(CallVoid (Wrapper tglobaladdr:$addr))]>; +def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", + [(CallVoid Int32Regs:$addr)]>; +def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", + [(CallVoid Int64Regs:$addr)]>; +def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", + [(Prototype (i32 imm:$val))]>; + +def DeclareRetMemInst : + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), + ".param .align $align .b8 retval$num[$size];", + [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetScalarInst : + NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".param .b$size retval$num;", + [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetRegInst : + NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".reg .b$size retval$num;", + [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; + +def DeclareParamInst : + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), + ".param .align $align .b8 param$a[$size];", + [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; +def DeclareScalarParamInst : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".param .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; 
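+// The last operand of DeclareScalarParam selects the declaration flavor:
+// (i32 0) emits a .param (DeclareScalarParamInst above) and (i32 1) emits a
+// .reg (DeclareScalarRegInst below).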
+def DeclareScalarRegInst : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".reg .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; + +class MoveParamInst<NVPTXRegClass regclass, string asmstr> : + NVPTXInst<(outs regclass:$dst), (ins regclass:$src), + !strconcat("mov", asmstr, " \t$dst, $src;"), + [(set regclass:$dst, (MoveParam regclass:$src))]>; + +def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">; +def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">; +def MoveParamI16 : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.u16.u32 \t$dst, $src;", + [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; +def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">; +def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">; +def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">; + +class PseudoUseParamInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$src), + "// Pseudo use of $src", + [(PseudoUseParam regclass:$src)]>; + +def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>; +def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>; +def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>; +def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>; +def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>; + + +// +// Load / Store Handling +// +multiclass LD<NVPTXRegClass regclass> { + def _avar : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _areg : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _areg_64 : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _ari : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; + def _ari_64 : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; + def _asi : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; +} + +let mayLoad=1, hasSideEffects=0 in { + defm LD_i8 : LD<Int16Regs>; + defm LD_i16 : LD<Int16Regs>; + defm LD_i32 : LD<Int32Regs>; + defm LD_i64 : LD<Int64Regs>; + defm LD_f16 : LD<Float16Regs>; + defm LD_f16x2 : LD<Float16x2Regs>; + defm LD_f32 : LD<Float32Regs>; + defm LD_f64 : LD<Float64Regs>; +} + +multiclass ST<NVPTXRegClass regclass> { + def _avar : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " 
\t[$addr], $src;", []>; + def _areg : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _areg_64 : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _ari : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; + def _ari_64 : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; + def _asi : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; +} + +let mayStore=1, hasSideEffects=0 in { + defm ST_i8 : ST<Int16Regs>; + defm ST_i16 : ST<Int16Regs>; + defm ST_i32 : ST<Int32Regs>; + defm ST_i64 : ST<Int64Regs>; + defm ST_f16 : ST<Float16Regs>; + defm ST_f16x2 : ST<Float16x2Regs>; + defm ST_f32 : ST<Float32Regs>; + defm ST_f64 : ST<Float64Regs>; +} + +// The following is used only in and after vector elementizations. Vector +// elementization happens at the machine instruction level, so the following +// instructions never appear in the DAG. 
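As a hedged illustration of where those vector forms show up in practice, a CUDA float4 copy is typically lowered to a single pair of the LDV_/STV_ instructions defined next (the PTX in comments is approximate):

__device__ void copy4(const float4 *in, float4 *out) {
  // Approximately, via the LDV_f32_v4_* / STV_f32_v4_* variants:
  //   ld.global.v4.f32  {%f1, %f2, %f3, %f4}, [%rd1];
  //   st.global.v4.f32  [%rd2], {%f1, %f2, %f3, %f4};
  *out = *in;
}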
+multiclass LD_VEC<NVPTXRegClass regclass> { + def _v2_avar : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_areg : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_areg_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_ari : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v2_ari_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v2_asi : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v4_avar : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_areg : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_areg_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_ari : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + def _v4_ari_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, 
$dst4}}, [$addr+$offset];", []>; + def _v4_asi : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; +} +let mayLoad=1, hasSideEffects=0 in { + defm LDV_i8 : LD_VEC<Int16Regs>; + defm LDV_i16 : LD_VEC<Int16Regs>; + defm LDV_i32 : LD_VEC<Int32Regs>; + defm LDV_i64 : LD_VEC<Int64Regs>; + defm LDV_f16 : LD_VEC<Float16Regs>; + defm LDV_f16x2 : LD_VEC<Float16x2Regs>; + defm LDV_f32 : LD_VEC<Float32Regs>; + defm LDV_f64 : LD_VEC<Float64Regs>; +} + +multiclass ST_VEC<NVPTXRegClass regclass> { + def _v2_avar : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_areg : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_areg_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_ari : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v2_ari_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v2_asi : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v4_avar : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_areg : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_areg_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + 
"st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_ari : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_ari_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_asi : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; +} + +let mayStore=1, hasSideEffects=0 in { + defm STV_i8 : ST_VEC<Int16Regs>; + defm STV_i16 : ST_VEC<Int16Regs>; + defm STV_i32 : ST_VEC<Int32Regs>; + defm STV_i64 : ST_VEC<Int64Regs>; + defm STV_f16 : ST_VEC<Float16Regs>; + defm STV_f16x2 : ST_VEC<Float16x2Regs>; + defm STV_f32 : ST_VEC<Float32Regs>; + defm STV_f64 : ST_VEC<Float64Regs>; +} + +//---- Conversion ---- + +class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn, + NVPTXRegClass regclassOut> : + NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), + !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")), + [(set regclassOut:$d, (bitconvert regclassIn:$a))]>; + +def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>; +def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>; +def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>; +def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>; +def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; +def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>; +def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>; +def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>; + +// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where +// we cannot specify floating-point literals in isel patterns. Therefore, we +// use an integer selp to select either 1 or 0 and then cvt to floating-point. 
+ +// sint -> f16 +def : Pat<(f16 (sint_to_fp Int1Regs:$a)), + (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f16 (sint_to_fp Int16Regs:$a)), + (CVT_f16_s16 Int16Regs:$a, CvtRN)>; +def : Pat<(f16 (sint_to_fp Int32Regs:$a)), + (CVT_f16_s32 Int32Regs:$a, CvtRN)>; +def : Pat<(f16 (sint_to_fp Int64Regs:$a)), + (CVT_f16_s64 Int64Regs:$a, CvtRN)>; + +// uint -> f16 +def : Pat<(f16 (uint_to_fp Int1Regs:$a)), + (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f16 (uint_to_fp Int16Regs:$a)), + (CVT_f16_u16 Int16Regs:$a, CvtRN)>; +def : Pat<(f16 (uint_to_fp Int32Regs:$a)), + (CVT_f16_u32 Int32Regs:$a, CvtRN)>; +def : Pat<(f16 (uint_to_fp Int64Regs:$a)), + (CVT_f16_u64 Int64Regs:$a, CvtRN)>; + +// sint -> f32 +def : Pat<(f32 (sint_to_fp Int1Regs:$a)), + (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f32 (sint_to_fp Int16Regs:$a)), + (CVT_f32_s16 Int16Regs:$a, CvtRN)>; +def : Pat<(f32 (sint_to_fp Int32Regs:$a)), + (CVT_f32_s32 Int32Regs:$a, CvtRN)>; +def : Pat<(f32 (sint_to_fp Int64Regs:$a)), + (CVT_f32_s64 Int64Regs:$a, CvtRN)>; + +// uint -> f32 +def : Pat<(f32 (uint_to_fp Int1Regs:$a)), + (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f32 (uint_to_fp Int16Regs:$a)), + (CVT_f32_u16 Int16Regs:$a, CvtRN)>; +def : Pat<(f32 (uint_to_fp Int32Regs:$a)), + (CVT_f32_u32 Int32Regs:$a, CvtRN)>; +def : Pat<(f32 (uint_to_fp Int64Regs:$a)), + (CVT_f32_u64 Int64Regs:$a, CvtRN)>; + +// sint -> f64 +def : Pat<(f64 (sint_to_fp Int1Regs:$a)), + (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f64 (sint_to_fp Int16Regs:$a)), + (CVT_f64_s16 Int16Regs:$a, CvtRN)>; +def : Pat<(f64 (sint_to_fp Int32Regs:$a)), + (CVT_f64_s32 Int32Regs:$a, CvtRN)>; +def : Pat<(f64 (sint_to_fp Int64Regs:$a)), + (CVT_f64_s64 Int64Regs:$a, CvtRN)>; + +// uint -> f64 +def : Pat<(f64 (uint_to_fp Int1Regs:$a)), + (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f64 (uint_to_fp Int16Regs:$a)), + (CVT_f64_u16 Int16Regs:$a, CvtRN)>; +def : Pat<(f64 (uint_to_fp Int32Regs:$a)), + (CVT_f64_u32 Int32Regs:$a, CvtRN)>; +def : Pat<(f64 (uint_to_fp Int64Regs:$a)), + (CVT_f64_u64 Int64Regs:$a, CvtRN)>; + + +// f16 -> sint +def : Pat<(i1 (fp_to_sint Float16Regs:$a)), + (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint Float16Regs:$a)), + (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_sint Float16Regs:$a)), + (CVT_s16_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint Float16Regs:$a)), + (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_sint Float16Regs:$a)), + (CVT_s32_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint Float16Regs:$a)), + (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_sint Float16Regs:$a)), + (CVT_s64_f16 Float16Regs:$a, CvtRZI)>; + +// f16 -> uint +def : Pat<(i1 (fp_to_uint Float16Regs:$a)), + (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint Float16Regs:$a)), + (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_uint Float16Regs:$a)), + (CVT_u16_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint Float16Regs:$a)), + (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_uint Float16Regs:$a)), + (CVT_u32_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint Float16Regs:$a)), + (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : 
Pat<(i64 (fp_to_uint Float16Regs:$a)), + (CVT_u64_f16 Float16Regs:$a, CvtRZI)>; + +// f32 -> sint +def : Pat<(i1 (fp_to_sint Float32Regs:$a)), + (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint Float32Regs:$a)), + (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_sint Float32Regs:$a)), + (CVT_s16_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint Float32Regs:$a)), + (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_sint Float32Regs:$a)), + (CVT_s32_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint Float32Regs:$a)), + (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_sint Float32Regs:$a)), + (CVT_s64_f32 Float32Regs:$a, CvtRZI)>; + +// f32 -> uint +def : Pat<(i1 (fp_to_uint Float32Regs:$a)), + (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint Float32Regs:$a)), + (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_uint Float32Regs:$a)), + (CVT_u16_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint Float32Regs:$a)), + (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_uint Float32Regs:$a)), + (CVT_u32_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint Float32Regs:$a)), + (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_uint Float32Regs:$a)), + (CVT_u64_f32 Float32Regs:$a, CvtRZI)>; + +// f64 -> sint +def : Pat<(i1 (fp_to_sint Float64Regs:$a)), + (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint Float64Regs:$a)), + (CVT_s16_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint Float64Regs:$a)), + (CVT_s32_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint Float64Regs:$a)), + (CVT_s64_f64 Float64Regs:$a, CvtRZI)>; + +// f64 -> uint +def : Pat<(i1 (fp_to_uint Float64Regs:$a)), + (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint Float64Regs:$a)), + (CVT_u16_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint Float64Regs:$a)), + (CVT_u32_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint Float64Regs:$a)), + (CVT_u64_f64 Float64Regs:$a, CvtRZI)>; + +// sext i1 +def : Pat<(i16 (sext Int1Regs:$a)), + (SELP_s16ii -1, 0, Int1Regs:$a)>; +def : Pat<(i32 (sext Int1Regs:$a)), + (SELP_s32ii -1, 0, Int1Regs:$a)>; +def : Pat<(i64 (sext Int1Regs:$a)), + (SELP_s64ii -1, 0, Int1Regs:$a)>; + +// zext i1 +def : Pat<(i16 (zext Int1Regs:$a)), + (SELP_u16ii 1, 0, Int1Regs:$a)>; +def : Pat<(i32 (zext Int1Regs:$a)), + (SELP_u32ii 1, 0, Int1Regs:$a)>; +def : Pat<(i64 (zext Int1Regs:$a)), + (SELP_u64ii 1, 0, Int1Regs:$a)>; + +// anyext i1 +def : Pat<(i16 (anyext Int1Regs:$a)), + (SELP_u16ii -1, 0, Int1Regs:$a)>; +def : Pat<(i32 (anyext Int1Regs:$a)), + (SELP_u32ii -1, 0, Int1Regs:$a)>; +def : Pat<(i64 (anyext Int1Regs:$a)), + (SELP_u64ii -1, 0, Int1Regs:$a)>; + +// sext i16 +def : Pat<(i32 (sext Int16Regs:$a)), + (CVT_s32_s16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i64 (sext Int16Regs:$a)), + (CVT_s64_s16 Int16Regs:$a, CvtNONE)>; + +// zext i16 +def : Pat<(i32 (zext Int16Regs:$a)), + (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i64 (zext Int16Regs:$a)), + (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; + +// anyext i16 +def : Pat<(i32 (anyext Int16Regs:$a)), + (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i64 (anyext Int16Regs:$a)), + (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; + +// sext i32 +def : 
Pat<(i64 (sext Int32Regs:$a)), + (CVT_s64_s32 Int32Regs:$a, CvtNONE)>; + +// zext i32 +def : Pat<(i64 (zext Int32Regs:$a)), + (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; + +// anyext i32 +def : Pat<(i64 (anyext Int32Regs:$a)), + (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; + + +// truncate i64 +def : Pat<(i32 (trunc Int64Regs:$a)), + (CVT_u32_u64 Int64Regs:$a, CvtNONE)>; +def : Pat<(i16 (trunc Int64Regs:$a)), + (CVT_u16_u64 Int64Regs:$a, CvtNONE)>; +def : Pat<(i1 (trunc Int64Regs:$a)), + (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>; + +// truncate i32 +def : Pat<(i16 (trunc Int32Regs:$a)), + (CVT_u16_u32 Int32Regs:$a, CvtNONE)>; +def : Pat<(i1 (trunc Int32Regs:$a)), + (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>; + +// truncate i16 +def : Pat<(i1 (trunc Int16Regs:$a)), + (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>; + +// sext_inreg +def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>; +def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>; +def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>; +def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>; +def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>; +def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>; + + +// Select instructions with 32-bit predicates +def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b), + (SELP_b16rr Int16Regs:$a, Int16Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b), + (SELP_b32rr Int32Regs:$a, Int32Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), + (SELP_b64rr Int64Regs:$a, Int64Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b), + (SELP_f16rr Float16Regs:$a, Float16Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), + (SELP_f32rr Float32Regs:$a, Float32Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), + (SELP_f64rr Float64Regs:$a, Float64Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; + + +let hasSideEffects = 0 in { + // pack a set of smaller int registers to a larger int register + def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2, + Int16Regs:$s3, Int16Regs:$s4), + "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; + def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2), + "mov.b32 \t$d, {{$s1, $s2}};", []>; + def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int32Regs:$s1, Int32Regs:$s2), + "mov.b64 \t$d, {{$s1, $s2}};", []>; + def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), + (ins Float32Regs:$s1, Float32Regs:$s2), + "mov.b64 \t$d, {{$s1, $s2}};", []>; + + // unpack a larger int register to a set of smaller int registers + def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, + Int16Regs:$d3, Int16Regs:$d4), + (ins Int64Regs:$s), + "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; + def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), + (ins Int32Regs:$s), + "mov.b32 \t{{$d1, $d2}}, $s;", []>; + def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), + (ins Int64Regs:$s), + "mov.b64 \t{{$d1, $d2}}, $s;", []>; + def F64toV2F32 : NVPTXInst<(outs 
Float32Regs:$d1, Float32Regs:$d2), + (ins Float64Regs:$s), + "mov.b64 \t{{$d1, $d2}}, $s;", []>; + +} + +let hasSideEffects = 0 in { + // Extract element of f16x2 register. PTX does not provide any way + // to access elements of an f16x2 vector directly, so we need to + // extract it using a temporary register. + def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst), + (ins Float16x2Regs:$src), + "{{ .reg .b16 \t%tmp_hi;\n\t" + " mov.b32 \t{$dst, %tmp_hi}, $src; }}", + [(set Float16Regs:$dst, + (extractelt (v2f16 Float16x2Regs:$src), 0))]>; + def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst), + (ins Float16x2Regs:$src), + "{{ .reg .b16 \t%tmp_lo;\n\t" + " mov.b32 \t{%tmp_lo, $dst}, $src; }}", + [(set Float16Regs:$dst, + (extractelt (v2f16 Float16x2Regs:$src), 1))]>; + + // Coalesce two f16 registers into f16x2 + def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + "mov.b32 \t$dst, {{$a, $b}};", + [(set Float16x2Regs:$dst, + (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>; + + // Directly initializing the underlying b32 register is one less SASS + // instruction than the vector-packing move. + def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src), + "mov.b32 \t$dst, $src;", + []>; + + // Split f16x2 into two f16 registers. + def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), + (ins Float16x2Regs:$src), + "mov.b32 \t{{$lo, $hi}}, $src;", + []>; + // Split an i32 into two f16 + def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), + (ins Int32Regs:$src), + "mov.b32 \t{{$lo, $hi}}, $src;", + []>; +} + +// Count leading zeros +let hasSideEffects = 0 in { + def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "clz.b32 \t$d, $a;", []>; + def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "clz.b64 \t$d, $a;", []>; +} + +// 32-bit has a direct PTX instruction +def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; + +// The return type of the ctlz ISD node is the same as its input, but the PTX +// clz instruction always returns a 32-bit value. For ctlz.i64, convert the +// ptx value to 64 bits to match the ISD node's semantics, unless we know we're +// truncating back down to 32 bits. +def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; + +// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the +// result back to 16-bits if necessary. We also need to subtract 16 because +// the high-order 16 zeros were counted. +// +// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could +// use to save one SASS instruction (on sm_35 anyway): +// +// mov.b32 $tmp, {0xffff, $a} +// clz.b32 $result, $tmp +// +// That is, instead of zero-extending the input to 32 bits, we'd "one-extend" +// and then clz that value. This way we don't have to subtract 16 from the +// result. Unfortunately today we don't have a way to generate +// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
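For reference, a hedged CUDA analogue of this 16-bit lowering (clz16 is a made-up helper, not a CUDA builtin; __clz is the device intrinsic that maps to clz.b32):

__device__ int clz16(unsigned short x) {
  // Same idea as the patterns below: widen to 32 bits, count with clz.b32,
  // then subtract the 16 leading zeros introduced by the widening.
  return __clz((int)x) - 16;
}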
+def : Pat<(ctlz Int16Regs:$a), + (SUBi16ri (CVT_u16_u32 + (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; +def : Pat<(i32 (zext (ctlz Int16Regs:$a))), + (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; + +// Population count +let hasSideEffects = 0 in { + def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "popc.b32 \t$d, $a;", []>; + def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "popc.b64 \t$d, $a;", []>; +} + +// 32-bit has a direct PTX instruction +def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>; + +// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit +// to match the LLVM semantics. Just as with ctlz.i64, we provide a second +// pattern that avoids the type conversion if we're truncating the result to +// i32 anyway. +def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; + +// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. +// If we know that we're storing into an i32, we can avoid the final trunc. +def : Pat<(ctpop Int16Regs:$a), + (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; +def : Pat<(i32 (zext (ctpop Int16Regs:$a))), + (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; + +// fpround f32 -> f16 +def : Pat<(f16 (fpround Float32Regs:$a)), + (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f16 (fpround Float32Regs:$a)), + (CVT_f16_f32 Float32Regs:$a, CvtRN)>; + +// fpround f64 -> f16 +def : Pat<(f16 (fpround Float64Regs:$a)), + (CVT_f16_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f16 (fpround Float64Regs:$a)), + (CVT_f16_f64 Float64Regs:$a, CvtRN)>; + +// fpround f64 -> f32 +def : Pat<(f32 (fpround Float64Regs:$a)), + (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpround Float64Regs:$a)), + (CVT_f32_f64 Float64Regs:$a, CvtRN)>; + +// fpextend f16 -> f32 +def : Pat<(f32 (fpextend Float16Regs:$a)), + (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend Float16Regs:$a)), + (CVT_f32_f16 Float16Regs:$a, CvtNONE)>; + +// fpextend f16 -> f64 +def : Pat<(f64 (fpextend Float16Regs:$a)), + (CVT_f64_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f64 (fpextend Float16Regs:$a)), + (CVT_f64_f16 Float16Regs:$a, CvtNONE)>; + +// fpextend f32 -> f64 +def : Pat<(f64 (fpextend Float32Regs:$a)), + (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f64 (fpextend Float32Regs:$a)), + (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; + +def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +// fceil, ffloor, fround, ftrunc. 
+ +def : Pat<(fceil Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fceil Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fceil Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fceil Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fceil Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRPI)>; + +def : Pat<(ffloor Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ffloor Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ffloor Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ffloor Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ffloor Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRMI)>; + +def : Pat<(fround Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f16 (fround Float16Regs:$a)), + (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fround Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fround Float32Regs:$a)), + (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(f64 (fround Float64Regs:$a)), + (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; + +def : Pat<(ftrunc Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ftrunc Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ftrunc Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ftrunc Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ftrunc Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRZI)>; + +// nearbyint and rint are implemented as rounding to nearest even. This isn't +// strictly correct, because it causes us to ignore the rounding mode. But it +// matches what CUDA's "libm" does. 
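A small CUDA illustration of the behavior described above (hedged: round_even is an illustrative helper, and the exact cvt modifier also depends on the FTZ predicates in the patterns that follow):

__device__ float round_even(float x) {
  // Both rintf and nearbyintf lower through the frint/fnearbyint patterns
  // below to "cvt.rni.f32.f32", i.e. round-to-nearest-even with the dynamic
  // rounding mode ignored, matching CUDA's libm.
  return rintf(x);
}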
+ +def : Pat<(fnearbyint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fnearbyint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fnearbyint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fnearbyint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fnearbyint Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; + +def : Pat<(frint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(frint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(frint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(frint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(frint Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; + + +//----------------------------------- +// Control-flow +//----------------------------------- + +let isTerminator=1 in { + let isReturn=1, isBarrier=1 in + def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>; + + let isBranch=1 in + def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), + "@$a bra \t$target;", + [(brcond Int1Regs:$a, bb:$target)]>; + let isBranch=1 in + def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), + "@!$a bra \t$target;", []>; + + let isBranch=1, isBarrier=1 in + def GOTO : NVPTXInst<(outs), (ins brtarget:$target), + "bra.uni \t$target;", [(br bb:$target)]>; +} + +def : Pat<(brcond Int32Regs:$a, bb:$target), + (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; + +// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a +// conditional branch if the target block is the next block so that the code +// can fall through to the target block. The inversion is done by 'xor +// condition, 1', which will be translated to (setne condition, -1). Since ptx +// supports '@!pred bra target', we should use it. +def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), + (CBranchOther Int1Regs:$a, bb:$target)>; + +// Call +def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; +def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; + +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPSideEffect]>; + +def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; +def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def calltarget : Operand<i32>; +let isCall=1 in { + def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; +} + +def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; +def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; + +// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> + : NVPTXInst<outs, ins, asmstr, pattern>; + +def Callseq_Start : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\{ // callseq $amt1, $amt2\n" + "\t.reg .b32 temp_param_reg;", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def Callseq_End : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\} // callseq $amt1", + [(callseq_end timm:$amt1, timm:$amt2)]>; + +// trap instruction +def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; + +// Call prototype wrapper +def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def CallPrototype : + SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def ProtoIdent : Operand<i32> { + let PrintMethod = "printProtoIdent"; +} +def CALL_PROTOTYPE : + NVPTXInst<(outs), (ins ProtoIdent:$ident), + "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; + + +include "NVPTXIntrinsics.td" + + +//----------------------------------- +// Notes +//----------------------------------- +// BSWAP is currently expanded. The following would be more efficient: +// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers +// - for sm_20, use prmt (use vector scalar mov to get the pack and +// unpack). sm_20 supports native 32-bit registers, but not native 16-bit +// registers. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8d228a9eeb74..c932758bd0ae 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -71,6 +71,38 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;", [(int_nvvm_bar_sync imm:$i)]>; +def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;", + [(int_nvvm_bar_warp_sync imm:$i)]>, + Requires<[hasPTX60, hasSM30]>; +def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;", + [(int_nvvm_bar_warp_sync Int32Regs:$i)]>, + Requires<[hasPTX60, hasSM30]>; + +def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;", + [(int_nvvm_barrier_sync imm:$i)]>, + Requires<[hasPTX60, hasSM30]>; +def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;", + [(int_nvvm_barrier_sync Int32Regs:$i)]>, + Requires<[hasPTX60, hasSM30]>; + +def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt), + "barrier.sync \t$id, $cnt;", + [(int_nvvm_barrier_sync_cnt Int32Regs:$id, Int32Regs:$cnt)]>, + Requires<[hasPTX60, hasSM30]>; +def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt), + "barrier.sync \t$id, $cnt;", + [(int_nvvm_barrier_sync_cnt Int32Regs:$id, imm:$cnt)]>, + Requires<[hasPTX60, hasSM30]>; +def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt), + "barrier.sync \t$id, $cnt;", + [(int_nvvm_barrier_sync_cnt imm:$id, Int32Regs:$cnt)]>, + Requires<[hasPTX60, hasSM30]>; +def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt), + "barrier.sync \t$id, $cnt;", + [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>, + Requires<[hasPTX60, hasSM30]>; + + // shfl.{up,down,bfly,idx}.b32 multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { // The last two parameters to shfl can be regs or imms.
ptxas is smart @@ -111,8 +143,168 @@ defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>; defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>; defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>; -} // isConvergent = 1 +multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { + // The threadmask and the last two parameters to shfl.sync can be regs or imms. + // ptxas is smart enough to inline constant registers, so strictly speaking we + // don't need to handle immediates here. But it's easy enough, and it makes + // our ptx more readable. + def rrr : NVPTXInst< + (outs regclass:$dst), + (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, + Int32Regs:$offset, Int32Regs:$mask))]>; + + def rri : NVPTXInst< + (outs regclass:$dst), + (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, + Int32Regs:$offset, imm:$mask))]>; + + def rir : NVPTXInst< + (outs regclass:$dst), + (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, + imm:$offset, Int32Regs:$mask))]>; + + def rii : NVPTXInst< + (outs regclass:$dst), + (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, + imm:$offset, imm:$mask))]>; + + def irr : NVPTXInst< + (outs regclass:$dst), + (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, + Int32Regs:$offset, Int32Regs:$mask))]>; + + def iri : NVPTXInst< + (outs regclass:$dst), + (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, + Int32Regs:$offset, imm:$mask))]>; + + def iir : NVPTXInst< + (outs regclass:$dst), + (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, + imm:$offset, Int32Regs:$mask))]>; + def iii : NVPTXInst< + (outs regclass:$dst), + (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), + !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), + [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, + imm:$offset, imm:$mask))]>; +} + +// On sm_70 these don't have to be convergent, so we may eventually want to +// implement a non-convergent variant of this intrinsic.
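As a usage sketch for these shfl.sync lowerings (hedged: this assumes CUDA 9's __shfl_down_sync builtin, which the CUDA headers route through the int_nvvm_shfl_sync_* intrinsics matched here):

__device__ float warp_reduce_sum(float v) {
  // Each iteration emits one shfl.sync.down.b32 (INT_SHFL_SYNC_DOWN_F32,
  // defined just below); 0xffffffff is the member mask naming every lane.
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  return v;
}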
+defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>; +defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>; +defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>; +defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>; +defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>; +defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>; +defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>; +defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>; + + +// vote.{all,any,uni,ballot} +multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { + def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred), + "vote." # mode # " \t$dest, $pred;", + [(set regclass:$dest, (IntOp Int1Regs:$pred))]>, + Requires<[hasPTX60, hasSM30]>; +} + +defm VOTE_ALL : VOTE<Int1Regs, "all.pred", int_nvvm_vote_all>; +defm VOTE_ANY : VOTE<Int1Regs, "any.pred", int_nvvm_vote_any>; +defm VOTE_UNI : VOTE<Int1Regs, "uni.pred", int_nvvm_vote_uni>; +defm VOTE_BALLOT : VOTE<Int32Regs, "ballot.b32", int_nvvm_vote_ballot>; + +// vote.sync.{all,any,uni,ballot} +multiclass VOTE_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { + def i : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, Int1Regs:$pred), + "vote.sync." # mode # " \t$dest, $pred, $mask;", + [(set regclass:$dest, (IntOp imm:$mask, Int1Regs:$pred))]>, + Requires<[hasPTX60, hasSM30]>; + def r : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, Int1Regs:$pred), + "vote.sync." # mode #" \t$dest, $pred, $mask;", + [(set regclass:$dest, (IntOp Int32Regs:$mask, Int1Regs:$pred))]>, + Requires<[hasPTX60, hasSM30]>; +} + +defm VOTE_SYNC_ALL : VOTE_SYNC<Int1Regs, "all.pred", int_nvvm_vote_all_sync>; +defm VOTE_SYNC_ANY : VOTE_SYNC<Int1Regs, "any.pred", int_nvvm_vote_any_sync>; +defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>; +defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>; + +multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, + Operand ImmOp> { + def ii : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, ImmOp:$value), + "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", + [(set regclass:$dest, (IntOp imm:$mask, imm:$value))]>, + Requires<[hasPTX60, hasSM70]>; + def ir : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, ImmOp:$value), + "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", + [(set regclass:$dest, (IntOp Int32Regs:$mask, imm:$value))]>, + Requires<[hasPTX60, hasSM70]>; + def ri : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, regclass:$value), + "match.any.sync." # ptxtype # " \t$dest, $value, $mask;", + [(set regclass:$dest, (IntOp imm:$mask, regclass:$value))]>, + Requires<[hasPTX60, hasSM70]>; + def rr : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, regclass:$value), + "match.any.sync." 
# ptxtype # " \t$dest, $value, $mask;", + [(set regclass:$dest, (IntOp Int32Regs:$mask, regclass:$value))]>, + Requires<[hasPTX60, hasSM70]>; +} + +defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<Int32Regs, "b32", int_nvvm_match_any_sync_i32, + i32imm>; +defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_sync_i64, + i64imm>; + +multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, + Operand ImmOp> { + def ii : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + (ins i32imm:$mask, ImmOp:$value), + "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", + // It would be nice if tablegen could match multiple return values, + // but that does not seem to be the case. Thus we have an empty pattern and + // lower the intrinsic to an instruction manually. + // [(set regclass:$dest, Int1Regs:$pred, (IntOp imm:$value, imm:$mask))]>, + []>, + Requires<[hasPTX60, hasSM70]>; + def ir : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + (ins Int32Regs:$mask, ImmOp:$value), + "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", + []>, + Requires<[hasPTX60, hasSM70]>; + def ri : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + (ins i32imm:$mask, regclass:$value), + "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", + []>, + Requires<[hasPTX60, hasSM70]>; + def rr : NVPTXInst<(outs regclass:$dest, Int1Regs:$pred), + (ins Int32Regs:$mask, regclass:$value), + "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;", + []>, + Requires<[hasPTX60, hasSM70]>; +} +defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_sync_i32p, + i32imm>; +defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p, + i64imm>; + +} // isConvergent = 1 //----------------------------------- // Explicit Memory Fence Functions @@ -787,6 +979,33 @@ def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs, def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs, Float64Regs, int_nvvm_bitcast_d2ll>; +// +// FNS +// + +class INT_FNS_MBO<dag ins, dag Operands> + : NVPTXInst<(outs Int32Regs:$dst), ins, + "fns.b32 \t$dst, $mask, $base, $offset;", + [(set Int32Regs:$dst, Operands )]>, + Requires<[hasPTX60, hasSM30]>; + +def INT_FNS_rrr : INT_FNS_MBO<(ins Int32Regs:$mask, Int32Regs:$base, Int32Regs:$offset), + (int_nvvm_fns Int32Regs:$mask, Int32Regs:$base, Int32Regs:$offset)>; +def INT_FNS_rri : INT_FNS_MBO<(ins Int32Regs:$mask, Int32Regs:$base, i32imm:$offset), + (int_nvvm_fns Int32Regs:$mask, Int32Regs:$base, imm:$offset)>; +def INT_FNS_rir : INT_FNS_MBO<(ins Int32Regs:$mask, i32imm:$base, Int32Regs:$offset), + (int_nvvm_fns Int32Regs:$mask, imm:$base, Int32Regs:$offset)>; +def INT_FNS_rii : INT_FNS_MBO<(ins Int32Regs:$mask, i32imm:$base, i32imm:$offset), + (int_nvvm_fns Int32Regs:$mask, imm:$base, imm:$offset)>; +def INT_FNS_irr : INT_FNS_MBO<(ins i32imm:$mask, Int32Regs:$base, Int32Regs:$offset), + (int_nvvm_fns imm:$mask, Int32Regs:$base, Int32Regs:$offset)>; +def INT_FNS_iri : INT_FNS_MBO<(ins i32imm:$mask, Int32Regs:$base, i32imm:$offset), + (int_nvvm_fns imm:$mask, Int32Regs:$base, imm:$offset)>; +def INT_FNS_iir : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, Int32Regs:$offset), + (int_nvvm_fns imm:$mask, imm:$base, Int32Regs:$offset)>; +def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$offset), + (int_nvvm_fns imm:$mask, imm:$base, imm:$offset)>; + +//----------------------------------- // Atomic Functions //----------------------------------- @@ 
-903,6 +1122,12 @@ def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; +def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; +def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; +def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add", atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>; @@ -929,6 +1154,13 @@ defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add", defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add", atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>; +defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add", + atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>; +defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add", + atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>; +defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add", + atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>; + // atom_sub def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), @@ -7176,3 +7408,208 @@ def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>; def INT_PTX_SREG_WARPSIZE : NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;", [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>; + +// +// wmma.load.[a|b|c].sync.[row|col].m16n16k16[|.global|.shared].[f16|f32] +// +class WMMA_LOAD_ALSTOS<string Abc, string Layout, string Space, + string Type, NVPTXRegClass regclass, + Operand SrcOp, int WithOffset, int WithStride> + : NVPTXInst<!if(!eq(Abc#Type,"cf16"), + (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3), + (outs regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7)), + !if(WithStride, + !if(WithOffset, + (ins SrcOp:$src, i32imm:$offset, Int32Regs:$ldm), + (ins SrcOp:$src, Int32Regs:$ldm)), + !if(WithOffset, + (ins SrcOp:$src, i32imm:$offset), + (ins SrcOp:$src))), + "wmma.load."#Abc#".sync."#Layout#".m16n16k16"#Space#"." 
#Type# " \t" + #!if(!eq(Abc#Type,"cf16"), + "{{$r0, $r1, $r2, $r3}}", + "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}") + #", " + #!if(WithOffset,"[$src+$offset]", "[$src]") + #!if(WithStride, ", $ldm", "") + #";", + []>, + Requires<[hasPTX60, hasSM70]>; + +multiclass WMMA_LOAD_ALSTO<string Abc, string Layout, string Space, + string Type, NVPTXRegClass regclass, + Operand SrcOp, int WithOffset = 0> { + def _stride: WMMA_LOAD_ALSTOS<Abc, Layout, Space, Type, regclass, SrcOp, + WithOffset, 1>; + def NAME: WMMA_LOAD_ALSTOS<Abc, Layout, Space, Type, regclass, SrcOp, + WithOffset, 0>; +} + +multiclass WMMA_LOAD_ALST<string Abc, string Layout, string Space, + string Type, NVPTXRegClass regclass> { + defm _avar: WMMA_LOAD_ALSTO<Abc, Layout, Space, Type, regclass, imemAny, 0>; + defm _ari64: WMMA_LOAD_ALSTO<Abc, Layout, Space, Type, regclass, imemAny, 1>; +} + +multiclass WMMA_LOAD_ALT<string Abc, string Layout, + string Type, NVPTXRegClass regclass> { + defm _global: WMMA_LOAD_ALST<Abc, Layout, ".global", Type, regclass>; + defm _shared: WMMA_LOAD_ALST<Abc, Layout, ".shared", Type, regclass>; + defm NAME: WMMA_LOAD_ALST<Abc, Layout, "", Type, regclass>; +} + +multiclass WMMA_LOAD_AT<string Abc, string Type, NVPTXRegClass regclass> { + defm _row: WMMA_LOAD_ALT<Abc, "row", Type, regclass>; + defm _col: WMMA_LOAD_ALT<Abc, "col", Type, regclass>; +} + +defm INT_WMMA_LOAD_A: WMMA_LOAD_AT<"a", "f16", Float16x2Regs>; +defm INT_WMMA_LOAD_B: WMMA_LOAD_AT<"b", "f16", Float16x2Regs>; +defm INT_WMMA_LOAD_C_f16: WMMA_LOAD_AT<"c", "f16", Float16x2Regs>; +defm INT_WMMA_LOAD_C_f32: WMMA_LOAD_AT<"c", "f32", Float32Regs>; + +// +// wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32] +// +class WMMA_STORE_D_LSTOS<string Layout, string Space, + string Type, NVPTXRegClass regclass, + Operand DstOp, int WithOffset, int WithStride> + : NVPTXInst<(outs), + !if(!eq(Type,"f16"), + !if(WithStride, + !if(WithOffset, + (ins DstOp:$src, i32imm:$offset, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + Int32Regs:$ldm), + (ins DstOp:$src, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + Int32Regs:$ldm)), + !if(WithOffset, + (ins DstOp:$src, i32imm:$offset, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3), + (ins DstOp:$src, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3))), + !if(WithStride, + !if(WithOffset, + (ins DstOp:$src, i32imm:$offset, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7, + Int32Regs:$ldm), + (ins DstOp:$src, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7, + Int32Regs:$ldm)), + !if(WithOffset, + (ins DstOp:$src, i32imm:$offset, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7), + (ins DstOp:$src, + regclass:$r0, regclass:$r1, regclass:$r2, regclass:$r3, + regclass:$r4, regclass:$r5, regclass:$r6, regclass:$r7)))), + "wmma.store.d.sync."#Layout#".m16n16k16"#Space#"." 
#Type# " \t" + #!if(WithOffset,"[$src+$offset], ", "[$src], ") + #!if(!eq(Type,"f16"), + "{{$r0, $r1, $r2, $r3}}", + "{{$r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7}}") + #!if(WithStride, ", $ldm", "") + #";", + []>, + Requires<[hasPTX60, hasSM70]>; + +multiclass WMMA_STORE_D_LSTO<string Layout, string Space, + string Type, NVPTXRegClass regclass, + Operand DstOp, int WithOffset = 0> { + def _stride: WMMA_STORE_D_LSTOS<Layout, Space, Type, regclass, DstOp, + WithOffset, 1>; + def NAME: WMMA_STORE_D_LSTOS<Layout, Space, Type, regclass, DstOp, + WithOffset, 0>; +} + +multiclass WMMA_STORE_D_LST<string Layout, string Space, + string Type, NVPTXRegClass regclass> { + defm _avar: WMMA_STORE_D_LSTO<Layout, Space, Type, regclass, imemAny, 0>; + defm _ari64: WMMA_STORE_D_LSTO<Layout, Space, Type, regclass, imemAny, 1>; +} + +multiclass WMMA_STORE_D_LT<string Layout, + string Type, NVPTXRegClass regclass> { + defm _global: WMMA_STORE_D_LST<Layout, ".global", Type, regclass>; + defm _shared: WMMA_STORE_D_LST<Layout, ".shared", Type, regclass>; + defm NAME: WMMA_STORE_D_LST<Layout, "", Type, regclass>; +} + +multiclass WMMA_STORE_D_T<string Type, NVPTXRegClass regclass> { + defm _row: WMMA_STORE_D_LT<"row", Type, regclass>; + defm _col: WMMA_STORE_D_LT<"col", Type, regclass>; +} + +defm INT_WMMA_STORE_D_f16: WMMA_STORE_D_T<"f16", Float16x2Regs>; +defm INT_WMMA_STORE_D_f32: WMMA_STORE_D_T<"f32", Float32Regs>; + +// WMMA.MMA +class WMMA_MMA_ABDCS<string ALayout, string BLayout, + string DType, NVPTXRegClass d_reg, + string CType, NVPTXRegClass c_reg, + NVPTXRegClass ab_reg, + string Satfinite = ""> + : NVPTXInst<!if(!eq(DType,"f16"), + (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3), + (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3, + d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7)), + !if(!eq(CType,"f16"), + (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3, + ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7, + ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3, + ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7, + c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3), + (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3, + ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7, + ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3, + ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7, + c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3, + c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7)), + "wmma.mma.sync."#ALayout#"."#BLayout#".m16n16k16."# + #DType#"."#CType#Satfinite + #"\n\t\t" + #!if(!eq(DType,"f16"), + "{{$d0, $d1, $d2, $d3}}, \n\t\t", + "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t") + #"{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t" + #"{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t" + #!if(!eq(CType,"f16"), + "{{$c0, $c1, $c2, $c3}};", + "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};"), + []>, + Requires<[hasPTX60, hasSM70]>; + +multiclass WMMA_MMA_ABDC<string ALayout, string BLayout, + string DType, NVPTXRegClass d_reg, + string CType, NVPTXRegClass c_reg> { + def _satfinite: WMMA_MMA_ABDCS<ALayout, BLayout, + DType, d_reg, CType, c_reg, + Float16x2Regs, ".satfinite">; + def NAME: WMMA_MMA_ABDCS<ALayout, BLayout, + DType, d_reg, CType, c_reg, + Float16x2Regs>; +} + +multiclass WMMA_MMA_ABD<string ALayout, string BLayout, + string DType, NVPTXRegClass d_reg> { + defm _f16: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f16", Float16x2Regs>; + defm _f32: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f32", Float32Regs>; +} + +multiclass WMMA_MMA_AB<string ALayout, string BLayout> { + defm _f16: WMMA_MMA_ABD<ALayout, BLayout, "f16", 
+// WMMA.MMA
+class WMMA_MMA_ABDCS<string ALayout, string BLayout,
+                     string DType, NVPTXRegClass d_reg,
+                     string CType, NVPTXRegClass c_reg,
+                     NVPTXRegClass ab_reg,
+                     string Satfinite = "">
+  : NVPTXInst<!if(!eq(DType,"f16"),
+                  (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3),
+                  (outs d_reg:$d0, d_reg:$d1, d_reg:$d2, d_reg:$d3,
+                        d_reg:$d4, d_reg:$d5, d_reg:$d6, d_reg:$d7)),
+              !if(!eq(CType,"f16"),
+                  (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
+                       ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
+                       ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
+                       ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
+                       c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3),
+                  (ins ab_reg:$a0, ab_reg:$a1, ab_reg:$a2, ab_reg:$a3,
+                       ab_reg:$a4, ab_reg:$a5, ab_reg:$a6, ab_reg:$a7,
+                       ab_reg:$b0, ab_reg:$b1, ab_reg:$b2, ab_reg:$b3,
+                       ab_reg:$b4, ab_reg:$b5, ab_reg:$b6, ab_reg:$b7,
+                       c_reg:$c0, c_reg:$c1, c_reg:$c2, c_reg:$c3,
+                       c_reg:$c4, c_reg:$c5, c_reg:$c6, c_reg:$c7)),
+              "wmma.mma.sync."#ALayout#"."#BLayout#".m16n16k16."
+              #DType#"."#CType#Satfinite
+              #"\n\t\t"
+              #!if(!eq(DType,"f16"),
+                   "{{$d0, $d1, $d2, $d3}}, \n\t\t",
+                   "{{$d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7}},\n\t\t")
+              #"{{$a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7}},\n\t\t"
+              #"{{$b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7}},\n\t\t"
+              #!if(!eq(CType,"f16"),
+                   "{{$c0, $c1, $c2, $c3}};",
+                   "{{$c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7}};"),
+              []>,
+    Requires<[hasPTX60, hasSM70]>;
+
+multiclass WMMA_MMA_ABDC<string ALayout, string BLayout,
+                         string DType, NVPTXRegClass d_reg,
+                         string CType, NVPTXRegClass c_reg> {
+  def _satfinite: WMMA_MMA_ABDCS<ALayout, BLayout,
+                                 DType, d_reg, CType, c_reg,
+                                 Float16x2Regs, ".satfinite">;
+  def NAME: WMMA_MMA_ABDCS<ALayout, BLayout,
+                           DType, d_reg, CType, c_reg,
+                           Float16x2Regs>;
+}
+
+multiclass WMMA_MMA_ABD<string ALayout, string BLayout,
+                        string DType, NVPTXRegClass d_reg> {
+  defm _f16: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f16", Float16x2Regs>;
+  defm _f32: WMMA_MMA_ABDC<ALayout, BLayout, DType, d_reg, "f32", Float32Regs>;
+}
+
+multiclass WMMA_MMA_AB<string ALayout, string BLayout> {
+  defm _f16: WMMA_MMA_ABD<ALayout, BLayout, "f16", Float16x2Regs>;
+  defm _f32: WMMA_MMA_ABD<ALayout, BLayout, "f32", Float32Regs>;
+}
+
+multiclass WMMA_MMA_A<string ALayout> {
+  defm _col: WMMA_MMA_AB<ALayout, "col">;
+  defm _row: WMMA_MMA_AB<ALayout, "row">;
+}
+
+defm INT_WMMA_MMA_col: WMMA_MMA_A<"col">;
+defm INT_WMMA_MMA_row: WMMA_MMA_A<"row">;
+
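Annotation: these definitions enumerate every wmma.mma.sync layout and type
combination (row/col for A and B, f16/f32 for C and D, optional .satfinite).
A corresponding CUDA sketch of the D = A * B + C step, under the same sm_70
assumptions as above and with illustrative names:

    #include <cuda_fp16.h>
    #include <mma.h>
    using namespace nvcuda;

    // One m16n16k16 tile of D = A * B + C with f32 accumulation.
    __global__ void mma_demo(const half *a, const half *b,
                             const float *c, float *d) {
      wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> fa;
      wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> fb;
      wmma::fragment<wmma::accumulator, 16, 16, 16, float> fc, fd;
      wmma::load_matrix_sync(fa, a, 16);
      wmma::load_matrix_sync(fb, b, 16);
      wmma::load_matrix_sync(fc, c, 16, wmma::mem_row_major);
      wmma::mma_sync(fd, fa, fb, fc);  // wmma.mma.sync.row.col.m16n16k16.f32.f32
      wmma::store_matrix_sync(d, fd, 16, wmma::mem_row_major);
    }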
"llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 8d46694fbe50..755738329881 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -18,8 +18,8 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MachineLocation.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h index c310a9c1ad0c..6185a0b54cac 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H #include "ManagedStringPool.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include <sstream> #define GET_REGINFO_HEADER diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index 2022caca76ee..82befe4b101b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -158,7 +158,7 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) { unsigned Param = atoi(Sym.data()+ParamBaseName.size()); std::string NewSym; raw_string_ostream NewSymStr(NewSym); - NewSymStr << MF.getFunction()->getName() << "_param_" << Param; + NewSymStr << MF.getName() << "_param_" << Param; InstrsToRemove.insert(&TexHandleDef); Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str()); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 96618cf46373..3a0bfd221b0b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -20,8 +20,8 @@ #include "NVPTXInstrInfo.h" #include "NVPTXRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <string> #define GET_SUBTARGETINFO_HEADER diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 2b6ba8c85d4d..85f757878f94 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -81,23 +81,28 @@ static std::string computeDataLayout(bool is64Bit) { if (!is64Bit) Ret += "-p:32:32"; - Ret += "-i64:64-v16:16-v32:32-n16:32:64"; + Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; return Ret; } +static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) { + if (CM) + return *CM; + return CodeModel::Small; +} + NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, + Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, bool is64bit) // The pic relocation 
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 88288abe64f9..1402033b9e60 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -17,12 +17,12 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 8d46694fbe50..755738329881 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -18,8 +18,8 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MachineLocation.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
index c310a9c1ad0c..6185a0b54cac 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
 
 #include "ManagedStringPool.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include <sstream>
 
 #define GET_REGINFO_HEADER
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 2022caca76ee..82befe4b101b 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -158,7 +158,7 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
     unsigned Param = atoi(Sym.data()+ParamBaseName.size());
     std::string NewSym;
     raw_string_ostream NewSymStr(NewSym);
-    NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+    NewSymStr << MF.getName() << "_param_" << Param;
 
     InstrsToRemove.insert(&TexHandleDef);
     Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 96618cf46373..3a0bfd221b0b 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -20,8 +20,8 @@
 #include "NVPTXInstrInfo.h"
 #include "NVPTXRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
 
 #define GET_SUBTARGETINFO_HEADER
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 2b6ba8c85d4d..85f757878f94 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -81,23 +81,28 @@ static std::string computeDataLayout(bool is64Bit) {
   if (!is64Bit)
     Ret += "-p:32:32";
-  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
 
   return Ret;
 }
 
+static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
+  if (CM)
+    return *CM;
+  return CodeModel::Small;
+}
+
 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
                                        const TargetOptions &Options,
                                        Optional<Reloc::Model> RM,
-                                       CodeModel::Model CM,
+                                       Optional<CodeModel::Model> CM,
                                        CodeGenOpt::Level OL, bool is64bit)
     // The pic relocation model is used regardless of what the client has
     // specified, as it is the only relocation model currently supported.
     : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
-                        Reloc::PIC_, CM, OL),
-      is64bit(is64bit),
-      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+                        Reloc::PIC_, getEffectiveCodeModel(CM), OL),
+      is64bit(is64bit), TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   if (TT.getOS() == Triple::NVCL)
     drvInterface = NVPTX::NVCL;
@@ -114,8 +119,8 @@ NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                            StringRef CPU, StringRef FS,
                                            const TargetOptions &Options,
                                            Optional<Reloc::Model> RM,
-                                           CodeModel::Model CM,
-                                           CodeGenOpt::Level OL)
+                                           Optional<CodeModel::Model> CM,
+                                           CodeGenOpt::Level OL, bool JIT)
     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
 
 void NVPTXTargetMachine64::anchor() {}
 
@@ -124,8 +129,8 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                            StringRef CPU, StringRef FS,
                                            const TargetOptions &Options,
                                            Optional<Reloc::Model> RM,
-                                           CodeModel::Model CM,
-                                           CodeGenOpt::Level OL)
+                                           Optional<CodeModel::Model> CM,
+                                           CodeGenOpt::Level OL, bool JIT)
     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
 
 namespace {
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 2f3981be22f8..54a72a688ee3 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -17,7 +17,7 @@
 #include "ManagedStringPool.h"
 #include "NVPTXSubtarget.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
@@ -36,7 +36,7 @@ class NVPTXTargetMachine : public LLVMTargetMachine {
 public:
   NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                      StringRef FS, const TargetOptions &Options,
-                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
                      CodeGenOpt::Level OP, bool is64bit);
 
   ~NVPTXTargetMachine() override;
@@ -75,8 +75,8 @@ class NVPTXTargetMachine32 : public NVPTXTargetMachine {
 public:
   NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
-                       Optional<Reloc::Model> RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL);
+                       Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                       CodeGenOpt::Level OL, bool JIT);
 };
 
 class NVPTXTargetMachine64 : public NVPTXTargetMachine {
@@ -84,8 +84,8 @@ class NVPTXTargetMachine64 : public NVPTXTargetMachine {
 public:
   NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
-                       Optional<Reloc::Model> RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL);
+                       Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                       CodeGenOpt::Level OL, bool JIT);
 };
 
 } // end namespace llvm
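Annotation: besides the header moves and the Optional code model (defaulted
to Small by getEffectiveCodeModel), the data layout gains i128:128, giving
128-bit integers a defined 16-byte alignment; this pairs with the i128
parameter and return handling added elsewhere in this import. A CUDA sketch,
under the assumption of a toolchain that accepts __int128 in device code
(clang's CUDA path does; the kernel name is illustrative):

    // 64x64 -> 128-bit multiply on the device, exercising i128 lowering.
    __global__ void mul64x64(const unsigned long long *a,
                             const unsigned long long *b,
                             unsigned __int128 *out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        out[i] = static_cast<unsigned __int128>(a[i]) * b[i];
    }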
"llvm/Target/TargetLoweringObjectFile.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index a64d95512a4a..307654aed37f 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -13,9 +13,9 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/CodeGen/CostTable.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" using namespace llvm; #define DEBUG_TYPE "NVPTXtti" diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index f987892ba675..d2414b72a009 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -21,7 +21,7 @@ #include "NVPTXTargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/TargetLowering.h" namespace llvm { @@ -63,6 +63,22 @@ public: void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { + // Volatile loads/stores are only supported for shared and global address + // spaces, or for generic AS that maps to them. + if (!(AddrSpace == llvm::ADDRESS_SPACE_GENERIC || + AddrSpace == llvm::ADDRESS_SPACE_GLOBAL || + AddrSpace == llvm::ADDRESS_SPACE_SHARED)) + return false; + + switch(I->getOpcode()){ + default: + return false; + case Instruction::Load: + case Instruction::Store: + return true; + } + } }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td deleted file mode 100644 index e69bbba9f193..000000000000 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td +++ /dev/null @@ -1,1479 +0,0 @@ -//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
deleted file mode 100644
index e69bbba9f193..000000000000
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
+++ /dev/null
@@ -1,1479 +0,0 @@
-//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//-----------------------------------
-// Vector Specific
-//-----------------------------------
-
-//
-// All vector instructions derive from NVPTXVecInst
-//
-
-class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
-                   NVPTXInst sInst=NOP>
-  : NVPTXInst<outs, ins, asmstr, pattern> {
-  NVPTXInst scalarInst=sInst;
-}
-
-let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
-// Extract v2i16
-def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
-                                (ins V2I16Regs:$src, i8imm:$c),
-                                "mov.u16 \t$dst, $src${c:vecelem};",
-                                [(set Int16Regs:$dst, (extractelt
-                                  (v2i16 V2I16Regs:$src), imm:$c))],
-                                IMOV16rr>;
-
-// Extract v4i16
-def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
-                                (ins V4I16Regs:$src, i8imm:$c),
-                                "mov.u16 \t$dst, $src${c:vecelem};",
-                                [(set Int16Regs:$dst, (extractelt
-                                  (v4i16 V4I16Regs:$src), imm:$c))],
-                                IMOV16rr>;
-
-// Extract v2i8
-def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
-                               (ins V2I8Regs:$src, i8imm:$c),
-                               "mov.u16 \t$dst, $src${c:vecelem};",
-                               [(set Int8Regs:$dst, (extractelt
-                                 (v2i8 V2I8Regs:$src), imm:$c))],
-                               IMOV8rr>;
-
-// Extract v4i8
-def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
-                               (ins V4I8Regs:$src, i8imm:$c),
-                               "mov.u16 \t$dst, $src${c:vecelem};",
-                               [(set Int8Regs:$dst, (extractelt
-                                 (v4i8 V4I8Regs:$src), imm:$c))],
-                               IMOV8rr>;
-
-// Extract v2i32
-def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
-                                (ins V2I32Regs:$src, i8imm:$c),
-                                "mov.u32 \t$dst, $src${c:vecelem};",
-                                [(set Int32Regs:$dst, (extractelt
-                                  (v2i32 V2I32Regs:$src), imm:$c))],
-                                IMOV32rr>;
-
-// Extract v2f32
-def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
-                                (ins V2F32Regs:$src, i8imm:$c),
-                                "mov.f32 \t$dst, $src${c:vecelem};",
-                                [(set Float32Regs:$dst, (extractelt
-                                  (v2f32 V2F32Regs:$src), imm:$c))],
-                                FMOV32rr>;
-
-// Extract v2i64
-def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
-                                (ins V2I64Regs:$src, i8imm:$c),
-                                "mov.u64 \t$dst, $src${c:vecelem};",
-                                [(set Int64Regs:$dst, (extractelt
-                                  (v2i64 V2I64Regs:$src), imm:$c))],
-                                IMOV64rr>;
-
-// Extract v2f64
-def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
-                                (ins V2F64Regs:$src, i8imm:$c),
-                                "mov.f64 \t$dst, $src${c:vecelem};",
-                                [(set Float64Regs:$dst, (extractelt
-                                  (v2f64 V2F64Regs:$src), imm:$c))],
-                                FMOV64rr>;
-
-// Extract v4i32
-def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
-                                (ins V4I32Regs:$src, i8imm:$c),
-                                "mov.u32 \t$dst, $src${c:vecelem};",
-                                [(set Int32Regs:$dst, (extractelt
-                                  (v4i32 V4I32Regs:$src), imm:$c))],
-                                IMOV32rr>;
-
-// Extract v4f32
-def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
-                                (ins V4F32Regs:$src, i8imm:$c),
-                                "mov.f32 \t$dst, $src${c:vecelem};",
-                                [(set Float32Regs:$dst, (extractelt
-                                  (v4f32 V4F32Regs:$src), imm:$c))],
-                                FMOV32rr>;
-}
-
-let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
-// Insert v2i8
-def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
-                              (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
-                              "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
-                              "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
-                              [(set V2I8Regs:$dst,
-                                (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
-
-// Insert v4i8
-def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
-                              (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
-                              "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
-                              "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
-                              [(set V4I8Regs:$dst,
-                                (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
-
-// Insert v2i16
-def V2i16Insert : NVPTXVecInst<(outs
V2I16Regs:$dst), - (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c), - "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.u16 \t$dst${c:vecelem}, $val;", - [(set V2I16Regs:$dst, - (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; - -// Insert v4i16 -def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), - (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c), - "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.u16 \t$dst${c:vecelem}, $val;", - [(set V4I16Regs:$dst, - (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; - -// Insert v2i32 -def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), - (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c), - "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.u32 \t$dst${c:vecelem}, $val;", - [(set V2I32Regs:$dst, - (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; - -// Insert v2f32 -def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), - (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c), - "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.f32 \t$dst${c:vecelem}, $val;", - [(set V2F32Regs:$dst, - (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; - -// Insert v2i64 -def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), - (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c), - "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.u64 \t$dst${c:vecelem}, $val;", - [(set V2I64Regs:$dst, - (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))], - IMOV64rr>; - -// Insert v2f64 -def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), - (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c), - "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.f64 \t$dst${c:vecelem}, $val;", - [(set V2F64Regs:$dst, - (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))], - FMOV64rr>; - -// Insert v4i32 -def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst), - (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c), - "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.u32 \t$dst${c:vecelem}, $val;", - [(set V4I32Regs:$dst, - (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; - -// Insert v4f32 -def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), - (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c), - "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};" - "\n\tmov.f32 \t$dst${c:vecelem}, $val;", - [(set V4F32Regs:$dst, - (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; -} - -class BinOpAsmString<string c> { - string s = c; -} - -class V4AsmStr<string opcode> : BinOpAsmString< - !strconcat(!strconcat(!strconcat(!strconcat( - !strconcat(!strconcat(!strconcat( - opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"), - opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"), - opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"), - opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>; - -class V2AsmStr<string opcode> : BinOpAsmString< - !strconcat(!strconcat(!strconcat( - opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"), - opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>; - -class V4MADStr<string opcode> : BinOpAsmString< - !strconcat(!strconcat(!strconcat(!strconcat( - !strconcat(!strconcat(!strconcat( - opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"), - opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"), - opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"), - opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>; - -class V2MADStr<string opcode> : BinOpAsmString< - !strconcat(!strconcat(!strconcat( - opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"), - opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>; - -class 
V4UnaryStr<string opcode> : BinOpAsmString< - !strconcat(!strconcat(!strconcat(!strconcat( - !strconcat(!strconcat(!strconcat( - opcode, " \t${dst}_0, ${a}_0;\n\t"), - opcode), " \t${dst}_1, ${a}_1;\n\t"), - opcode), " \t${dst}_2, ${a}_2;\n\t"), - opcode), " \t${dst}_3, ${a}_3;")>; - -class V2UnaryStr<string opcode> : BinOpAsmString< - !strconcat(!strconcat(!strconcat( - opcode, " \t${dst}_0, ${a}_0;\n\t"), - opcode), " \t${dst}_1, ${a}_1;")>; - -class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass, - NVPTXInst sInst=NOP> : - NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b), - asmstr.s, - [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))], - sInst>; - -class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1, - NVPTXRegClass regclass2, NVPTXInst sInst=NOP> : - NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b), - asmstr.s, - [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))], - sInst>; - -class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass, - NVPTXInst sInst=NOP> : - NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a), - asmstr.s, - [(set regclass:$dst, (OpNode regclass:$a))], sInst>; - -multiclass IntBinVOp<string asmstr, SDNode OpNode, - NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst - i16op=NOP, NVPTXInst i8op=NOP> { - def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs, - i64op>; - def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs, - i32op>; - def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs, - i32op>; - def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs, - i16op>; - def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs, - i16op>; - def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs, - i8op>; - def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs, - i8op>; -} - -multiclass FloatBinVOp<string asmstr, SDNode OpNode, - NVPTXInst f64=NOP, NVPTXInst f32=NOP, - NVPTXInst f32_ftz=NOP> { - def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode, - V2F64Regs, f64>; - def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode, - V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>; - def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode, - V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>; - def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode, - V4F32Regs, f32>; - def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode, - V2F32Regs, f32>; -} - -multiclass IntUnaryVOp<string asmstr, PatFrag OpNode, - NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, - NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> { - def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode, - V2I64Regs, i64op>; - def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode, - V4I32Regs, i32op>; - def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode, - V2I32Regs, i32op>; - def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode, - V4I16Regs, i16op>; - def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode, - V2I16Regs, i16op>; - def V4I8 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode, - V4I8Regs, i8op>; - def V2I8 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode, - V2I8Regs, i8op>; -} - - -// Integer Arithmetic -let VecInstType=isVecOther.Value in { -defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, 
ADDi32rr, ADDi16rr, ADDi8rr>; -defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>; - -def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs, - ADDCCi32rr>; -def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs, - ADDCCi32rr>; -def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs, - SUBCCi32rr>; -def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs, - SUBCCi32rr>; -def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs, - ADDCCCi32rr>; -def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs, - ADDCCCi32rr>; -def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs, - SUBCCCi32rr>; -def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs, - SUBCCCi32rr>; - -def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs, - SHLi64rr>; -def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs, - SHLi32rr>; -def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs, - SHLi32rr>; -def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs, - SHLi16rr>; -def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs, - SHLi16rr>; -def ShiftLV2I8 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs, V2I32Regs, - SHLi8rr>; -def ShiftLV4I8 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs, V4I32Regs, - SHLi8rr>; -} - -// cvt to v*i32, helpers for shift -class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr, - NVPTXInst sInst=NOP> : - NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>; - -class VecCVTStrHelper<string op, string dest, string src> { - string s=!strconcat(op, !strconcat("\t", - !strconcat(dest, !strconcat(", ", !strconcat(src, ";"))))); -} - -class Vec2CVTStr<string op> { - string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, - !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s)); -} - -class Vec4CVTStr<string op> { - string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, - !strconcat("\n\t", - !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s, - !strconcat("\n\t", - !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s, - !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s)))))); -} - -let VecInstType=isVecOther.Value in { -def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs, - Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>; -def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs, - Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>; -def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs, - Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>; -def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs, - Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>; -def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs, - Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>; -} - -def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2), - (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; -def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2), - (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; -def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2), - (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; - -def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2), - (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; -def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2), - (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; - -let VecInstType=isVecOther.Value in { -def 
ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs, - SRAi64rr>; -def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs, - SRAi32rr>; -def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs, - SRAi32rr>; -def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs, - SRAi16rr>; -def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs, - SRAi16rr>; -def ShiftRAV2I8 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs, V2I32Regs, - SRAi8rr>; -def ShiftRAV4I8 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs, V4I32Regs, - SRAi8rr>; - -def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs, - SRLi64rr>; -def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs, - SRLi32rr>; -def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs, - SRLi32rr>; -def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs, - SRLi16rr>; -def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs, - SRLi16rr>; -def ShiftRLV2I8 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs, V2I32Regs, - SRLi8rr>; -def ShiftRLV4I8 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs, V4I32Regs, - SRLi8rr>; - -defm VMult : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr, - MULTi8rr>; -defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr, - MULTHSi16rr, - MULTHSi8rr>; -defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr, - MULTHUi16rr, - MULTHUi8rr>; -defm VSDiv : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr, - SDIVi8rr>; -defm VUDiv : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr, - UDIVi8rr>; -defm VSRem : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr, - SREMi8rr>; -defm VURem : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr, - UREMi8rr>; -} - -def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2), - (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; -def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2), - (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; -def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2), - (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; - -def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2), - (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; -def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2), - (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; - -def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2), - (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; -def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2), - (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; -def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2), - (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; - -def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2), - (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; -def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2), - (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; - -multiclass VMAD<string asmstr, NVPTXRegClass regclassv4, - NVPTXRegClass regclassv2, - SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP, - Predicate Pred> { - def V4 : NVPTXVecInst<(outs regclassv4:$dst), - (ins regclassv4:$a, regclassv4:$b, regclassv4:$c), - V4MADStr<asmstr>.s, - [(set regclassv4:$dst, - (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))], - sop>, - Requires<[Pred]>; - def V2 : NVPTXVecInst<(outs regclassv2:$dst), - (ins 
regclassv2:$a, regclassv2:$b, regclassv2:$c), - V2MADStr<asmstr>.s, - [(set regclassv2:$dst, - (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))], - sop>, - Requires<[Pred]>; -} - -multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP, - Predicate Pred> { - def V2 : NVPTXVecInst<(outs regclass:$dst), - (ins regclass:$a, regclass:$b, regclass:$c), - V2MADStr<asmstr>.s, - [(set regclass:$dst, (add - (mul regclass:$a, regclass:$b), regclass:$c))], sop>, - Requires<[Pred]>; -} -multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP, - Predicate Pred> { - def V2 : NVPTXVecInst<(outs regclass:$dst), - (ins regclass:$a, regclass:$b, regclass:$c), - V2MADStr<asmstr>.s, - [(set regclass:$dst, (fadd - (fmul regclass:$a, regclass:$b), regclass:$c))], sop>, - Requires<[Pred]>; -} - -let VecInstType=isVecOther.Value in { -defm I8MAD : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>; -defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr, - true>; -defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr, - true>; -defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>; - -defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>; - -defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>; -defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>; -defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>; - -defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul, - FMAD32_ftzrrr, doFMADF32_ftz>; -defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul, - FMA32_ftzrrr, doFMAF32_ftz>; -defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr, - doFMADF32>; -defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr, - doFMAF32>; -defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>; -} - -let VecInstType=isVecOther.Value in { -def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs, - FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>; -def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs, - FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>; -def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs, - FDIV32rr_prec>, Requires<[reqPTX20]>; -def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs, - FDIV32rr_prec>, Requires<[reqPTX20]>; -def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs, - FDIV32rr_ftz>, Requires<[doF32FTZ]>; -def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs, - FDIV32rr_ftz>, Requires<[doF32FTZ]>; -def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>; -def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>; -def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>; -} - -def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>; - -let VecInstType=isVecOther.Value in { -def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs, - FNEGf32_ftz>, Requires<[doF32FTZ]>; -def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs, - FNEGf32_ftz>, Requires<[doF32FTZ]>; -def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>; -def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>; -def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, 
V2F64Regs, FNEGf64>; - -// Logical Arithmetic -defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>; -defm VOr : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>; -defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>; - -defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>; -} - - -multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)), - (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c, V2F32Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c), - (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>, - Requires<[Pred]>; -} - -defm V2FMAF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>; -defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>; -defm V2FMAF32ext : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>; -defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>; - -multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)), - (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c, V4F32Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c), - (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>, - Requires<[Pred]>; -} - -defm V4FMAF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>; -defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>; -defm V4FMAF32ext : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>; -defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>; - -multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)), - (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c), - (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>, - Requires<[Pred]>; -} - -defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>; - -class VecModStr<string vecsize, string elem, string extra, string l=""> -{ - string t1 = !strconcat("${c", elem); - string t2 = !strconcat(t1, ":vecv"); - string t3 = !strconcat(t2, vecsize); - string t4 = !strconcat(t3, extra); - string t5 = !strconcat(t4, l); - string s = !strconcat(t5, "}"); -} -class ShuffleOneLine<string vecsize, string elem, string type> -{ - string t1 = VecModStr<vecsize, elem, "comm", "1">.s; - string t2 = !strconcat(t1, "mov."); - string t3 = !strconcat(t2, type); - string t4 = !strconcat(t3, " \t${dst}_"); - string t5 = !strconcat(t4, elem); - string t6 = !strconcat(t5, ", $src1"); - string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s); - string t8 = !strconcat(t7, ";\n\t"); - string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s); - string t10 = !strconcat(t9, "mov."); - string t11 = !strconcat(t10, type); - string t12 = !strconcat(t11, " \t${dst}_"); - string t13 = !strconcat(t12, elem); - string t14 = !strconcat(t13, ", $src2"); - string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s); - string s = !strconcat(t15, ";"); -} -class ShuffleAsmStr2<string type> -{ - string t1 = ShuffleOneLine<"2", "0", type>.s; - string t2 = !strconcat(t1, "\n\t"); - string s = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s); -} -class ShuffleAsmStr4<string type> -{ - string t1 = ShuffleOneLine<"4", "0", type>.s; - string t2 = !strconcat(t1, "\n\t"); 
- string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s); - string t4 = !strconcat(t3, "\n\t"); - string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s); - string t6 = !strconcat(t5, "\n\t"); - string s = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s); -} - -let hasSideEffects=0, VecInstType=isVecShuffle.Value in { -def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst), - (ins V4F32Regs:$src1, V4F32Regs:$src2, - i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", - ShuffleAsmStr4<"f32">.s), - [], FMOV32rr>; - -def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst), - (ins V4I32Regs:$src1, V4I32Regs:$src2, - i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", - ShuffleAsmStr4<"u32">.s), - [], IMOV32rr>; - -def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst), - (ins V4I16Regs:$src1, V4I16Regs:$src2, - i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", - ShuffleAsmStr4<"u16">.s), - [], IMOV16rr>; - -def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst), - (ins V4I8Regs:$src1, V4I8Regs:$src2, - i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", - ShuffleAsmStr4<"u16">.s), - [], IMOV8rr>; - -def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst), - (ins V2F32Regs:$src1, V2F32Regs:$src2, - i8imm:$c0, i8imm:$c1), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", - ShuffleAsmStr2<"f32">.s), - [], FMOV32rr>; - -def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst), - (ins V2I32Regs:$src1, V2I32Regs:$src2, - i8imm:$c0, i8imm:$c1), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", - ShuffleAsmStr2<"u32">.s), - [], IMOV32rr>; - -def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst), - (ins V2I8Regs:$src1, V2I8Regs:$src2, - i8imm:$c0, i8imm:$c1), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", - ShuffleAsmStr2<"u16">.s), - [], IMOV8rr>; - -def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst), - (ins V2I16Regs:$src1, V2I16Regs:$src2, - i8imm:$c0, i8imm:$c1), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", - ShuffleAsmStr2<"u16">.s), - [], IMOV16rr>; - -def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst), - (ins V2F64Regs:$src1, V2F64Regs:$src2, - i8imm:$c0, i8imm:$c1), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", - ShuffleAsmStr2<"f64">.s), - [], FMOV64rr>; - -def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst), - (ins V2I64Regs:$src1, V2I64Regs:$src2, - i8imm:$c0, i8imm:$c1), - !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", - ShuffleAsmStr2<"u64">.s), - [], IMOV64rr>; -} - -def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{ - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - return CurDAG->getTargetConstant(SVOp->getMaskElt(0), SDLoc(N), MVT::i32); -}]>; -def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{ - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - return CurDAG->getTargetConstant(SVOp->getMaskElt(1), SDLoc(N), MVT::i32); -}]>; -def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{ - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - return CurDAG->getTargetConstant(SVOp->getMaskElt(2), SDLoc(N), MVT::i32); -}]>; -def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{ - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - return CurDAG->getTargetConstant(SVOp->getMaskElt(3), SDLoc(N), MVT::i32); -}]>; - -// The spurious call is here to 
silence a compiler warning about N being -// unused. -def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), - [{ N->getGluedNode(); return true; }]>; - -def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)), - (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; - -def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)), - (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), - (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; - -def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)), - (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; - -def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)), - (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; - -def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)), - (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), - (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; - -def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)), - (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; - -def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)), - (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), - (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; - -def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)), - (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; - -def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)), - (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), - (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; - -def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)), - (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2, - (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; - -class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass, - NVPTXInst si> - : NVPTXVecInst<(outs vclass:$dst), - (ins sclass:$a1, sclass:$a2), - !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"), - [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))], - si>; -class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass, - NVPTXInst si> - : NVPTXVecInst<(outs vclass:$dst), - (ins sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4), - !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"), - [(set vclass:$dst, - (build_vector sclass:$a1, sclass:$a2, - sclass:$a3, sclass:$a4))], si>; - -let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in { -def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs, - FMOV32rr>; -def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs, - FMOV64rr>; - -def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs, - IMOV32rr>; -def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs, - IMOV64rr>; -def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs, - IMOV16rr>; -def Build_Vector2_i8 : Build_Vector2<"mov.v2.u16", V2I8Regs, Int8Regs, - IMOV8rr>; - -def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs, - FMOV32rr>; - -def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", 
V4I32Regs, Int32Regs, - IMOV32rr>; -def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs, - IMOV16rr>; -def Build_Vector4_i8 : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs, - IMOV8rr>; -} - -class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP> - : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src), - !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"), - [], sop>; - -let isAsCheapAsAMove=1, hasSideEffects=0, IsSimpleMove=1, - VecInstType=isVecOther.Value in { -def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>; -def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>; - -def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>; -def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>; - -def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>; -def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>; - -def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>; -def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>; - -def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>; -def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>; -} - -// extract subvector patterns -def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR", - SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>; - -def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)), - (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0), - (V4f32Extract V4F32Regs:$src, 1))>; -def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)), - (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2), - (V4f32Extract V4F32Regs:$src, 3))>; -def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)), - (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0), - (V4i32Extract V4I32Regs:$src, 1))>; -def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)), - (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2), - (V4i32Extract V4I32Regs:$src, 3))>; -def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)), - (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0), - (V4i16Extract V4I16Regs:$src, 1))>; -def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)), - (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2), - (V4i16Extract V4I16Regs:$src, 3))>; -def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)), - (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0), - (V4i8Extract V4I8Regs:$src, 1))>; -def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)), - (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2), - (V4i8Extract V4I8Regs:$src, 3))>; - -// Select instructions -class Select_OneLine<string type, string pos> { - string t1 = !strconcat("selp.", type); - string t2 = !strconcat(t1, " \t${dst}_"); - string t3 = !strconcat(t2, pos); - string t4 = !strconcat(t3, ", ${src1}_"); - string t5 = !strconcat(t4, pos); - string t6 = !strconcat(t5, ", ${src2}_"); - string t7 = !strconcat(t6, pos); - string s = !strconcat(t7, ", $p;"); -} - -class Select_Str2<string type> { - string t1 = Select_OneLine<type, "0">.s; - string t2 = !strconcat(t1, "\n\t"); - string s = !strconcat(t2, Select_OneLine<type, "1">.s); -} - -class Select_Str4<string type> { - string t1 = Select_OneLine<type, "0">.s; - string t2 = !strconcat(t1, "\n\t"); - string t3 = !strconcat(t2, Select_OneLine<type, "1">.s); - string t4 = !strconcat(t3, "\n\t"); - string t5 = !strconcat(t4, Select_OneLine<type, "2">.s); - string t6 = !strconcat(t5, "\n\t"); - string s = !strconcat(t6, Select_OneLine<type, "3">.s); - -} - -class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop> - : NVPTXVecInst<(outs vclass:$dst), - (ins vclass:$src1, vclass:$src2, Int1Regs:$p), - asmstr, - [(set 
vclass:$dst, (select Int1Regs:$p, vclass:$src1, - vclass:$src2))], - sop>; - -let VecInstType=isVecOther.Value in { -def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>; -def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>; -def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>; -def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>; -def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>; -def V4I8_Select : Vec_Select<V4I8Regs, Select_Str4<"b16">.s, SELECTi8rr>; -def V2I8_Select : Vec_Select<V2I8Regs, Select_Str2<"b16">.s, SELECTi8rr>; - -def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>; -def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>; -def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>; -} - -// Comparison instructions - -// setcc convenience fragments. -def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETOEQ)>; -def vsetogt : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETOGT)>; -def vsetoge : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETOGE)>; -def vsetolt : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETOLT)>; -def vsetole : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETOLE)>; -def vsetone : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETONE)>; -def vseto : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETO)>; -def vsetuo : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETUO)>; -def vsetueq : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETUEQ)>; -def vsetugt : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETUGT)>; -def vsetuge : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETUGE)>; -def vsetult : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETULT)>; -def vsetule : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETULE)>; -def vsetune : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETUNE)>; -def vseteq : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETEQ)>; -def vsetgt : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETGT)>; -def vsetge : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETGE)>; -def vsetlt : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETLT)>; -def vsetle : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETLE)>; -def vsetne : PatFrag<(ops node:$lhs, node:$rhs), - (setcc node:$lhs, node:$rhs, SETNE)>; - -class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass, - NVPTXInst sop> - : NVPTXVecInst<(outs outrclass:$dst), - (ins inrclass:$a, inrclass:$b), - "Unsupported", - [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))], - sop>; - -multiclass Vec_Compare_All<PatFrag op, - NVPTXInst inst8, - NVPTXInst inst16, - NVPTXInst inst32, - NVPTXInst inst64> -{ - def V2I8 : Vec_Compare<op, V2I8Regs, V2I8Regs, inst8>; - def V4I8 : Vec_Compare<op, V4I8Regs, V4I8Regs, inst8>; - def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>; - def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>; - def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>; - def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>; - def V2I64 : Vec_Compare<op, 
V2I64Regs, V2I64Regs, inst64>; -} - -let VecInstType=isVecOther.Value in { - defm VecSGT : Vec_Compare_All<vsetgt, ISetSGTi8rr_toi8, ISetSGTi16rr_toi16, - ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>; - defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16, - ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>; - defm VecSLT : Vec_Compare_All<vsetlt, ISetSLTi8rr_toi8, ISetSLTi16rr_toi16, - ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>; - defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16, - ISetULTi32rr_toi32, ISetULTi64rr_toi64>; - defm VecSGE : Vec_Compare_All<vsetge, ISetSGEi8rr_toi8, ISetSGEi16rr_toi16, - ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>; - defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16, - ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>; - defm VecSLE : Vec_Compare_All<vsetle, ISetSLEi8rr_toi8, ISetSLEi16rr_toi16, - ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>; - defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16, - ISetULEi32rr_toi32, ISetULEi64rr_toi64>; - defm VecSEQ : Vec_Compare_All<vseteq, ISetSEQi8rr_toi8, ISetSEQi16rr_toi16, - ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>; - defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16, - ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>; - defm VecSNE : Vec_Compare_All<vsetne, ISetSNEi8rr_toi8, ISetSNEi16rr_toi16, - ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>; - defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16, - ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>; -} - -multiclass FVec_Compare_All<PatFrag op, - NVPTXInst instf32, - NVPTXInst instf64> -{ - def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>; - def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>; - def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>; -} - -let VecInstType=isVecOther.Value in { - defm FVecGT : FVec_Compare_All<vsetogt, FSetGTf32rr_toi32, - FSetGTf64rr_toi64>; - defm FVecLT : FVec_Compare_All<vsetolt, FSetLTf32rr_toi32, - FSetLTf64rr_toi64>; - defm FVecGE : FVec_Compare_All<vsetoge, FSetGEf32rr_toi32, - FSetGEf64rr_toi64>; - defm FVecLE : FVec_Compare_All<vsetole, FSetLEf32rr_toi32, - FSetLEf64rr_toi64>; - defm FVecEQ : FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32, - FSetEQf64rr_toi64>; - defm FVecNE : FVec_Compare_All<vsetone, FSetNEf32rr_toi32, - FSetNEf64rr_toi64>; - - defm FVecUGT : FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32, - FSetUGTf64rr_toi64>; - defm FVecULT : FVec_Compare_All<vsetult, FSetULTf32rr_toi32, - FSetULTf64rr_toi64>; - defm FVecUGE : FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32, - FSetUGEf64rr_toi64>; - defm FVecULE : FVec_Compare_All<vsetule, FSetULEf32rr_toi32, - FSetULEf64rr_toi64>; - defm FVecUEQ : FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32, - FSetUEQf64rr_toi64>; - defm FVecUNE : FVec_Compare_All<vsetune, FSetUNEf32rr_toi32, - FSetUNEf64rr_toi64>; - - defm FVecNUM : FVec_Compare_All<vseto, FSetNUMf32rr_toi32, - FSetNUMf64rr_toi64>; - defm FVecNAN : FVec_Compare_All<vsetuo, FSetNANf32rr_toi32, - FSetNANf64rr_toi64>; -} - -class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4), - (ins i32imm:$a, i32imm:$b), - !strconcat(!strconcat("ld.param", opstr), - "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>; - -class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$d1, regclass:$d2), - (ins i32imm:$a, i32imm:$b), - !strconcat(!strconcat("ld.param", opstr), - "\t{{$d1, $d2}}, [retval0+$b];"), []>; - - 
-class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
-  NVPTXInst<(outs),
-            (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
-                 i32imm:$a, i32imm:$b),
-            !strconcat(!strconcat("st.param", opstr),
-                       "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
-
-class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
-  NVPTXInst<(outs),
-            (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
-            !strconcat(!strconcat("st.param", opstr),
-                       "\t[param$a+$b], {{$s1, $s2}};"), []>;
-
-class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
-  NVPTXInst<(outs),
-            (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
-                 i32imm:$a),
-            !strconcat(!strconcat("st.param", opstr),
-                       "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
-
-class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
-  NVPTXInst<(outs),
-            (ins regclass:$s1, regclass:$s2, i32imm:$a),
-            !strconcat(!strconcat("st.param", opstr),
-                       "\t[func_retval+$a], {{$s1, $s2}};"), []>;
-
-def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
-def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
-def LoadParamScalar4I8 : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
-
-def LoadParamScalar2I64 : LoadParamScalar2Inst<Int32Regs, ".v2.b64">;
-def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
-def LoadParamScalar2I16 : LoadParamScalar2Inst<Int32Regs, ".v2.b16">;
-def LoadParamScalar2I8 : LoadParamScalar2Inst<Int32Regs, ".v2.b8">;
-
-def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
-def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
-def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
-
-def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
-def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
-def StoreParamScalar4I8 : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
-
-def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
-def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
-def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
-def StoreParamScalar2I8 : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
-
-def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
-def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
-def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
-
-def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
-def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
-def StoreRetvalScalar4I8 : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
-
-def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
-def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
-def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
-def StoreRetvalScalar2I8 : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
-
-def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
-def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
-def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
-
-class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
-  NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
-               "loadparam : $dst <- [$a, $b]",
-               [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
-               sop>;
-
-class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
-  : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
-                 "storeparam : [$a, $b] <- $val",
-                 [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
-
-class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
-                         NVPTXInst sop=NOP>
-  : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
-                 "storeretval : retval[$a] <- $val",
-                 [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
-
-let VecInstType=isVecLD.Value in {
-def LoadParamV4I32 : LoadParamVecInst<V4I32Regs, ".v4.b32",
-                                      LoadParamScalar4I32>;
-def LoadParamV4I16 : LoadParamVecInst<V4I16Regs, ".v4.b16",
-                                      LoadParamScalar4I16>;
-def LoadParamV4I8 : LoadParamVecInst<V4I8Regs, ".v4.b8",
-                                     LoadParamScalar4I8>;
-
-def LoadParamV2I64 : LoadParamVecInst<V2I64Regs, ".v2.b64",
-                                      LoadParamScalar2I64>;
-def LoadParamV2I32 : LoadParamVecInst<V2I32Regs, ".v2.b32",
-                                      LoadParamScalar2I32>;
-def LoadParamV2I16 : LoadParamVecInst<V2I16Regs, ".v2.b16",
-                                      LoadParamScalar2I16>;
-def LoadParamV2I8 : LoadParamVecInst<V2I8Regs, ".v2.b8",
-                                     LoadParamScalar2I8>;
-
-def LoadParamV4F32 : LoadParamVecInst<V4F32Regs, ".v4.f32",
-                                      LoadParamScalar4F32>;
-def LoadParamV2F32 : LoadParamVecInst<V2F32Regs, ".v2.f32",
-                                      LoadParamScalar2F32>;
-def LoadParamV2F64 : LoadParamVecInst<V2F64Regs, ".v2.f64",
-                                      LoadParamScalar2F64>;
-}
-
-let VecInstType=isVecST.Value in {
-def StoreParamV4I32 : StoreParamVecInst<V4I32Regs, ".v4.b32",
-                                        StoreParamScalar4I32>;
-def StoreParamV4I16 : StoreParamVecInst<V4I16Regs, ".v4.b16",
-                                        StoreParamScalar4I16>;
-def StoreParamV4I8 : StoreParamVecInst<V4I8Regs, ".v4.b8",
-                                       StoreParamScalar4I8>;
-
-def StoreParamV2I64 : StoreParamVecInst<V2I64Regs, ".v2.b64",
-                                        StoreParamScalar2I64>;
-def StoreParamV2I32 : StoreParamVecInst<V2I32Regs, ".v2.b32",
-                                        StoreParamScalar2I32>;
-def StoreParamV2I16 : StoreParamVecInst<V2I16Regs, ".v2.b16",
-                                        StoreParamScalar2I16>;
-def StoreParamV2I8 : StoreParamVecInst<V2I8Regs, ".v2.b8",
-                                       StoreParamScalar2I8>;
-
-def StoreParamV4F32 : StoreParamVecInst<V4F32Regs, ".v4.f32",
-                                        StoreParamScalar4F32>;
-def StoreParamV2F32 : StoreParamVecInst<V2F32Regs, ".v2.f32",
-                                        StoreParamScalar2F32>;
-def StoreParamV2F64 : StoreParamVecInst<V2F64Regs, ".v2.f64",
-                                        StoreParamScalar2F64>;
-
-def StoreRetvalV4I32 : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
-                                          StoreRetvalScalar4I32>;
-def StoreRetvalV4I16 : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
-                                          StoreRetvalScalar4I16>;
-def StoreRetvalV4I8 : StoreRetvalVecInst<V4I8Regs, ".v4.b8",
-                                         StoreRetvalScalar4I8>;
-
-def StoreRetvalV2I64 : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
-                                          StoreRetvalScalar2I64>;
-def StoreRetvalV2I32 : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
-                                          StoreRetvalScalar2I32>;
-def StoreRetvalV2I16 : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
-                                          StoreRetvalScalar2I16>;
-def StoreRetvalV2I8 : StoreRetvalVecInst<V2I8Regs, ".v2.b8",
-                                         StoreRetvalScalar2I8>;
-
-def StoreRetvalV4F32 : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
-                                          StoreRetvalScalar4F32>;
-def StoreRetvalV2F32 : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
-                                          StoreRetvalScalar2F32>;
-def StoreRetvalV2F64 : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
-                                          StoreRetvalScalar2F64>;
-
-}
-
-
-// Int vector to int scalar bit convert
-// v4i8 -> i32
-def : Pat<(i32 (bitconvert V4I8Regs:$s)),
-          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
-                     (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
-// v4i16 -> i64
-def : Pat<(i64 (bitconvert V4I16Regs:$s)),
-          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
-                      (V4i16Extract V4I16Regs:$s,1),
-                      (V4i16Extract V4I16Regs:$s,2),
-                      (V4i16Extract V4I16Regs:$s,3))>;
-// v2i8 -> i16
-def : Pat<(i16 (bitconvert V2I8Regs:$s)),
-          (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
-// v2i16 -> i32
-def : Pat<(i32 (bitconvert V2I16Regs:$s)),
-          (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
-                      (V2i16Extract V2I16Regs:$s,1))>;
-// v2i32 -> i64
-def : Pat<(i64 (bitconvert V2I32Regs:$s)),
-          (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
-                      (V2i32Extract V2I32Regs:$s,1))>;
-
-// Int scalar to int vector bit convert
-let VecInstType=isVecDest.Value in {
-// i32 -> v4i8
-def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
-                                "Error!",
-                                [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
-                                I32toV4I8>;
-// i64 -> v4i16
-def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
-                                 "Error!",
-                                 [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
-                                 I64toV4I16>;
-// i16 -> v2i8
-def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
-                                "Error!",
-                                [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
-                                I16toV2I8>;
-// i32 -> v2i16
-def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
-                                 "Error!",
-                                 [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
-                                 I32toV2I16>;
-// i64 -> v2i32
-def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
-                                 "Error!",
-                                 [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
-                                 I64toV2I32>;
-}
-
-// Int vector to int vector bit convert
-// v4i8 -> v2i16
-def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
-          (VecI32toV2I16
-           (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
-                      (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
-// v4i16 -> v2i32
-def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
-          (VecI64toV2I32
-           (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
-                       (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
-// v2i16 -> v4i8
-def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
-          (VecI32toV4I8
-           (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
-// v2i32 -> v4i16
-def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
-          (VecI64toV4I16
-           (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
-// v2i64 -> v4i32
-def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
-          (Build_Vector4_i32
-           (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
-           (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
-           (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
-           (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
-// v4i32 -> v2i64
-def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
-          (Build_Vector2_i64
-           (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
-           (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
-
-// Fp scalar to fp vector convert
-// f64 -> v2f32
-let VecInstType=isVecDest.Value in {
-def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
-                                 "Error!",
-                                 [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
-                                 F64toV2F32>;
-}
-
-// Fp vector to fp scalar convert
-// v2f32 -> f64
-def : Pat<(f64 (bitconvert V2F32Regs:$s)),
-          (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
-
-// Fp scalar to int vector convert
-// f32 -> v4i8
-def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
-          (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
-// f32 -> v2i16
-def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
-          (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
-// f64 -> v4i16
-def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
-          (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
-// f64 -> v2i32
-def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
-          (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
-
-// Int vector to fp scalar convert
-// v4i8 -> f32
-def : Pat<(f32 (bitconvert V4I8Regs:$s)),
-          (BITCONVERT_32_I2F
-           (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
-                      (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
-// v4i16 -> f64
-def : Pat<(f64 (bitconvert V4I16Regs:$s)),
-          (BITCONVERT_64_I2F
-           (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
-                       (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
-// v2i16 -> f32
-def : Pat<(f32 (bitconvert V2I16Regs:$s)),
-          (BITCONVERT_32_I2F
-           (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
-// v2i32 -> f64
-def : Pat<(f64 (bitconvert V2I32Regs:$s)),
-          (BITCONVERT_64_I2F
-           (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
-
-// Int scalar to fp vector convert
-// i64 -> v2f32
-def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
-          (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
-
-// Fp vector to int scalar convert
-// v2f32 -> i64
-def : Pat<(i64 (bitconvert V2F32Regs:$s)),
-          (BITCONVERT_64_F2I
-           (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
-
-// Int vector to fp vector convert
-// v2i64 -> v4f32
-def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
-          (Build_Vector4_f32
-           (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
-                                             (V2i64Extract V2I64Regs:$s, 0)), 0)),
-           (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
-                                             (V2i64Extract V2I64Regs:$s, 0)), 1)),
-           (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
-                                             (V2i64Extract V2I64Regs:$s, 1)), 0)),
-           (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
-                                             (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
-// v2i64 -> v2f64
-def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
-          (Build_Vector2_f64
-           (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
-           (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
-// v2i32 -> v2f32
-def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
-          (Build_Vector2_f32
-           (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
-           (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
-// v4i32 -> v2f64
-def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
-          (Build_Vector2_f64
-           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
-                                          (V4i32Extract V4I32Regs:$s,1))),
-           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
-                                          (V4i32Extract V4I32Regs:$s,3))))>;
-// v4i32 -> v4f32
-def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
-          (Build_Vector4_f32
-           (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
-           (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
-           (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
-           (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
-// v4i16 -> v2f32
-def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
-          (VecF64toV2F32 (BITCONVERT_64_I2F
-                          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
-                                      (V4i16Extract V4I16Regs:$s,1),
-                                      (V4i16Extract V4I16Regs:$s,2),
-                                      (V4i16Extract V4I16Regs:$s,3))))>;
-
-// Fp vector to int vector convert
-// v2i64 <- v4f32
-def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
-          (Build_Vector2_i64
-           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
-                                          (V4f32Extract V4F32Regs:$s,1))),
-           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
-                                          (V4f32Extract V4F32Regs:$s,3))))>;
-// v2i64 <- v2f64
-def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
-          (Build_Vector2_i64
-           (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
-           (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
-// v2i32 <- v2f32
-def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
-          (Build_Vector2_i32
-           (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
-           (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
-// v4i32 <- v2f64
-def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
-          (Build_Vector4_i32
-           (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
-                                             (V2f64Extract V2F64Regs:$s, 0)), 0)),
-           (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
-                                             (V2f64Extract V2F64Regs:$s, 0)), 1)),
-           (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
-                                             (V2f64Extract V2F64Regs:$s, 1)), 0)),
-           (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
-                                             (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
-// v4i32 <- v4f32
-def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
-          (Build_Vector4_i32
-           (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
-           (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
-           (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
-           (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
-// v4i16 <- v2f32
-def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
-          (VecI64toV4I16 (BITCONVERT_64_F2I
-                          (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
-                                      (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
index d44876abf729..803d643844f8 100644
--- a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -23,7 +23,7 @@ Target &llvm::getTheNVPTXTarget64() {
 
 extern "C" void LLVMInitializeNVPTXTargetInfo() {
   RegisterTarget<Triple::nvptx> X(getTheNVPTXTarget32(), "nvptx",
-                                  "NVIDIA PTX 32-bit");
+                                  "NVIDIA PTX 32-bit", "NVPTX");
   RegisterTarget<Triple::nvptx64> Y(getTheNVPTXTarget64(), "nvptx64",
-                                    "NVIDIA PTX 64-bit");
+                                    "NVIDIA PTX 64-bit", "NVPTX");
 }