author     Dimitry Andric <dim@FreeBSD.org>   2017-01-14 22:12:13 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2017-01-14 22:12:13 +0000
commit     f1a29dd3442304e183b0491fbe2d33f6c963069e (patch)
tree       64b5defb92948be8b09a6f1b5c48ec60abad1325 /contrib/llvm/lib/Target
parent     8a6fe8ce60ab99778558c4951d23615a0141daf0 (diff)
parent     581a6d8501ff5614297da837b81ed3b6956361ea (diff)
download   src-f1a29dd3442304e183b0491fbe2d33f6c963069e.tar.gz
           src-f1a29dd3442304e183b0491fbe2d33f6c963069e.zip
Merge llvm, clang, lld and lldb release_40 branch r292009. Also update
build glue.
Notes:
    svn path=/projects/clang400-import/; revision=312197
Diffstat (limited to 'contrib/llvm/lib/Target')
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 129
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 33
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 25
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 236
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 116
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 399
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 42
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstructions.td | 5
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 36
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td | 8
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td | 4
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp | 198
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMISelLowering.h | 41
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp | 35
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 3
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 162
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h | 7
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 152
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 58
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 118
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXSection.h | 10
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 37
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h | 10
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 22
-rw-r--r--  contrib/llvm/lib/Target/TargetMachine.cpp | 9
-rw-r--r--  contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 3
-rw-r--r--  contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 17
-rw-r--r--  contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/X86/X86.td | 41
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 10
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp | 387
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrAVX512.td | 39
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrSSE.td | 74
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Subtarget.h | 2
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 57
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h | 3
53 files changed, 1991 insertions, 585 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index e927d58ad612..d472a54d9543 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -18,9 +18,132 @@
namespace llvm {
namespace AArch64 {
-RegisterBank GPRRegBank;
-RegisterBank FPRRegBank;
-RegisterBank CCRRegBank;
+const uint32_t GPRCoverageData[] = {
+ // Classes 0-31
+ (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) |
+ (1u << AArch64::GPR32spRegClassID) |
+ (1u << AArch64::GPR32commonRegClassID) |
+ (1u << AArch64::GPR32sponlyRegClassID) |
+ (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) |
+ (1u << AArch64::GPR64spRegClassID) |
+ (1u << AArch64::GPR64commonRegClassID) |
+ (1u << AArch64::tcGPR64RegClassID) |
+ (1u << AArch64::GPR64sponlyRegClassID),
+ // Classes 32-63
+ 0,
+ // FIXME: The entries below this point can be safely removed once this is
+ // tablegenerated. It's only needed because of the hardcoded register class
+ // limit.
+ // Classes 64-96
+ 0,
+ // Classes 97-128
+ 0,
+ // Classes 129-160
+ 0,
+ // Classes 161-192
+ 0,
+ // Classes 193-224
+ 0,
+};
+
+const uint32_t FPRCoverageData[] = {
+ // Classes 0-31
+ (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) |
+ (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) |
+ (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) |
+ (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) |
+ (1u << AArch64::DDDDRegClassID),
+ // Classes 32-63
+ (1u << (AArch64::QQRegClassID - 32)) |
+ (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
+ (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
+ (1u
+ << (AArch64::
+ QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
+ 32)) |
+ (1u << (AArch64::QQQQRegClassID - 32)) |
+ (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
+ (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
+ (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
+ (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) |
+ (1u
+ << (AArch64::
+ QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
+ 32)) |
+ (1u
+ << (AArch64::
+ QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID -
+ 32)) |
+ (1u << (AArch64::QQQRegClassID - 32)) |
+ (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
+ (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
+ (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
+ (1u
+ << (AArch64::
+ QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID -
+ 32)),
+ // FIXME: The entries below this point can be safely removed once this
+ // is tablegenerated. It's only needed because of the hardcoded register
+ // class limit.
+ // Classes 64-96
+ 0,
+ // Classes 97-128
+ 0,
+ // Classes 129-160
+ 0,
+ // Classes 161-192
+ 0,
+ // Classes 193-224
+ 0,
+};
+
+const uint32_t CCRCoverageData[] = {
+ // Classes 0-31
+ 1u << AArch64::CCRRegClassID,
+ // Classes 32-63
+ 0,
+ // FIXME: The entries below this point can be safely removed once this
+ // is tablegenerated. It's only needed because of the hardcoded register
+ // class limit.
+ // Classes 64-96
+ 0,
+ // Classes 97-128
+ 0,
+ // Classes 129-160
+ 0,
+ // Classes 161-192
+ 0,
+ // Classes 193-224
+ 0,
+};
+
+RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData);
+RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData);
+RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData);
RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
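
For context on the coverage tables above: each 32-bit word covers 32 register classes, so class ID N is recorded in word N/32 at bit N%32, which is why the entries in the second word subtract 32 from every class ID. A minimal standalone lookup sketch (hypothetical helper, not part of this commit):

#include <cstdint>

// Query a coverage table with the layout used above: word i holds register
// classes [32*i, 32*i + 31], one bit per register class ID.
static bool bankCoversClass(const uint32_t *CoverageData, unsigned ClassID) {
  return (CoverageData[ClassID / 32] >> (ClassID % 32)) & 1u;
}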
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 74a01835171b..7b581a706fa2 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -159,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c5b95f282ea8..2244baacca17 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -951,10 +951,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
defm CLS : OneOperandData<0b101, "cls">;
defm CLZ : OneOperandData<0b100, "clz", ctlz>;
-defm RBIT : OneOperandData<0b000, "rbit">;
-
-def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
-def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;
+defm RBIT : OneOperandData<0b000, "rbit", bitreverse>;
def REV16Wr : OneWRegData<0b001, "rev16",
UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
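
Taken together, the two AArch64 hunks above retire the llvm.aarch64.rbit patterns in favor of the generic ISD::BITREVERSE node, which is now marked Legal and keyed directly to RBIT. As a hedged illustration (not part of the commit), the portable clang builtin should now select to a single rbit instruction:

#include <cstdint>

// With ISD::BITREVERSE legal for i32/i64 and the RBIT multiclass driven by
// the generic bitreverse node, this compiles to one "rbit" on AArch64.
uint32_t reverse_bits(uint32_t x) {
  return __builtin_bitreverse32(x);
}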
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index a5fd2fbdde19..b292c9c87dcd 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -41,28 +41,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
if (AlreadyInit)
return;
AlreadyInit = true;
- // Initialize the GPR bank.
- createRegisterBank(AArch64::GPRRegBankID, "GPR");
- // The GPR register bank is fully defined by all the registers in
- // GR64all + its subclasses.
- addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI);
+
const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
(void)RBGPR;
assert(&AArch64::GPRRegBank == &RBGPR &&
"The order in RegBanks is messed up");
+
+ const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+ (void)RBFPR;
+ assert(&AArch64::FPRRegBank == &RBFPR &&
+ "The order in RegBanks is messed up");
+
+ const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
+ (void)RBCCR;
+ assert(&AArch64::CCRRegBank == &RBCCR &&
+ "The order in RegBanks is messed up");
+
+ // The GPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
"Subclass not added?");
assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
- // Initialize the FPR bank.
- createRegisterBank(AArch64::FPRRegBankID, "FPR");
// The FPR register bank is fully defined by all the registers in
// GR64all + its subclasses.
- addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
- const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
- (void)RBFPR;
- assert(&AArch64::FPRRegBank == &RBFPR &&
- "The order in RegBanks is messed up");
assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
"Subclass not added?");
assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
@@ -70,13 +72,6 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
assert(RBFPR.getSize() == 512 &&
"FPRs should hold up to 512-bit via QQQQ sequence");
- // Initialize the CCR bank.
- createRegisterBank(AArch64::CCRRegBankID, "CCR");
- addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
- const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
- (void)RBCCR;
- assert(&AArch64::CCRRegBank == &RBCCR &&
- "The order in RegBanks is messed up");
assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
"Class not added?");
assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1a17691fc584..b8833e5a5552 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -374,7 +374,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
int AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -466,28 +466,27 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
-int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
unsigned Alignment, unsigned AddressSpace) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
- Src->isVectorTy() && Alignment != 16 &&
- Src->getVectorElementType()->isIntegerTy(64)) {
- // Unaligned stores are extremely inefficient. We don't split
- // unaligned v2i64 stores because the negative impact that has shown in
- // practice on inlined memcpy code.
- // We make v2i64 stores expensive so that we will only vectorize if there
+ LT.second.is128BitVector() && Alignment < 16) {
+ // Unaligned stores are extremely inefficient. We don't split all
+ // unaligned 128-bit stores because the negative impact that has shown in
+ // practice on inlined block copy code.
+ // We make such stores expensive so that we will only vectorize if there
// are 6 other instructions getting vectorized.
- int AmortizationCost = 6;
+ const int AmortizationCost = 6;
return LT.first * 2 * AmortizationCost;
}
- if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
- Src->getVectorNumElements() < 8) {
+ if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
+ Ty->getVectorNumElements() < 8) {
// We scalarize the loads/stores because there is not v.4b register and we
// have to promote the elements to v.4h.
- unsigned NumVecElts = Src->getVectorNumElements();
+ unsigned NumVecElts = Ty->getVectorNumElements();
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
// We generate 2 instructions per vector element.
return NumVectorizableInstsToAmortize * NumVecElts * 2;
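
The rewritten getMemoryOpCost prices any slow misaligned 128-bit store at LT.first * 2 * AmortizationCost, and for small i8 vectors returns NumVectorizableInstsToAmortize * NumVecElts * 2. A rough sketch of those two formulas with assumed inputs (illustrative only, not in-tree code):

// Assumed inputs: one legal 128-bit store (LT.first == 1) and a <4 x i8>
// vector (NumVecElts == 4).
constexpr int AmortizationCost = 6;
constexpr int MisalignedStoreCost = 1 /*LT.first*/ * 2 * AmortizationCost;   // 12
constexpr int SmallI8VectorCost = (4 * 2) /*amortize*/ * 4 /*elts*/ * 2;     // 64
static_assert(MisalignedStoreCost == 12, "cost shape check");
static_assert(SmallI8VectorCost == 64, "cost shape check");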
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 849fd3d9b44a..18287ed6653f 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -102,7 +102,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 730bcdcf7afa..e48c1943cb01 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -434,6 +434,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
+
+ // FIXME: This is only partially true. If we have to do vector compares, any
+ // SGPR pair can be a condition register. If we have a uniform condition, we
+ // are better off doing SALU operations, where there is only one SCC. For now,
+ // we don't have a way of knowing during instruction selection if a condition
+ // will be uniform and we always use vector compares. Assume we are using
+ // vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
// SI at least has hardware support for floating point exceptions, but no way
@@ -470,12 +477,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+static bool fnegFoldsIntoOp(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FSIN:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMUL_LEGACY:
+ return true;
+ default:
+ return false;
+ }
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -2679,8 +2705,93 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
return SDValue();
}
+static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
+ unsigned Op,
+ const SDLoc &SL,
+ SDValue Cond,
+ SDValue N1,
+ SDValue N2) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N1.getValueType();
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
+ N1.getOperand(0), N2.getOperand(0));
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(Op, SL, VT, NewSelect);
+}
+
+// Pull a free FP operation out of a select so it may fold into uses.
+//
+// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
+// select c, (fneg x), k -> fneg (select c, x, (fneg k))
+//
+// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
+// select c, (fabs x), +k -> fabs (select c, x, k)
+static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Cond = N.getOperand(0);
+ SDValue LHS = N.getOperand(1);
+ SDValue RHS = N.getOperand(2);
+
+ EVT VT = N.getValueType();
+ if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
+ (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ return distributeOpThroughSelect(DCI, LHS.getOpcode(),
+ SDLoc(N), Cond, LHS, RHS);
+ }
+
+ bool Inv = false;
+ if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
+ std::swap(LHS, RHS);
+ Inv = true;
+ }
+
+ // TODO: Support vector constants.
+ ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
+ SDLoc SL(N);
+ // If one side is an fneg/fabs and the other is a constant, we can push the
+ // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
+ SDValue NewLHS = LHS.getOperand(0);
+ SDValue NewRHS = RHS;
+
+ // Careful: if the neg can be folded up, don't try to pull it back down.
+ bool ShouldFoldNeg = true;
+
+ if (NewLHS.hasOneUse()) {
+ unsigned Opc = NewLHS.getOpcode();
+ if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+ ShouldFoldNeg = false;
+ if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
+ ShouldFoldNeg = false;
+ }
+
+ if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FNEG)
+ NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else if (CRHS->isNegative())
+ return SDValue();
+
+ if (Inv)
+ std::swap(NewLHS, NewRHS);
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+ Cond, NewLHS, NewRHS);
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+ }
+ }
+
+ return SDValue();
+}
+
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -2724,6 +2835,129 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}
+SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc = N0.getOpcode();
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form.
+ //
+ // TODO: Check users can fold
+ if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
+ return SDValue();
+
+ SDLoc SL(N);
+ switch (Opc) {
+ case ISD::FADD: {
+ // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() != ISD::FNEG)
+ LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMUL:
+ case AMDGPUISD::FMUL_LEGACY: {
+ // (fneg (fmul x, y)) -> (fmul x, (fneg y))
+ // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (RHS.getOpcode() == ISD::FNEG)
+ RHS = RHS.getOperand(0);
+ else
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMA:
+ case ISD::FMAD: {
+ // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
+ SDValue LHS = N0.getOperand(0);
+ SDValue MHS = N0.getOperand(1);
+ SDValue RHS = N0.getOperand(2);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (MHS.getOpcode() == ISD::FNEG)
+ MHS = MHS.getOperand(0);
+ else
+ MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FP_EXTEND:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case ISD::FSIN:
+ case AMDGPUISD::SIN_HW: {
+ SDValue CvtSrc = N0.getOperand(0);
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_extend (fneg x))) -> (fp_extend x)
+ // (fneg (rcp (fneg x))) -> (rcp x)
+ return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_extend x)) -> (fp_extend (fneg x))
+ // (fneg (rcp x)) -> (rcp (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(Opc, SL, VT, Neg);
+ }
+ case ISD::FP_ROUND: {
+ SDValue CvtSrc = N0.getOperand(0);
+
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_round (fneg x))) -> (fp_round x)
+ return DAG.getNode(ISD::FP_ROUND, SL, VT,
+ CvtSrc.getOperand(0), N0.getOperand(1));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_round x)) -> (fp_round (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2829,6 +3063,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performMulLoHi24Combine(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
+ case ISD::FNEG:
+ return performFNegCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
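
The new performFNegCombine and foldFreeOpFromSelect routines push fneg through adds, multiplies, and fused multiply-adds using plain sign-bit algebra. A quick numeric spot-check of the identities the combines rely on (illustrative only, not part of the commit):

#include <cassert>
#include <cmath>

int main() {
  // The rewrites used above:
  //   (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
  //   (fneg (fmul x, y)) -> (fmul x, (fneg y))
  //   (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
  float x = 1.5f, y = -2.25f, z = 4.0f;
  assert(-(x + y) == (-x) + (-y));
  assert(-(x * y) == x * (-y));
  assert(-std::fma(x, y, z) == std::fma(x, -y, -z));
  return 0;
}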
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 745c9923de2e..69567aa5f713 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -84,6 +84,7 @@ protected:
SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 513df3a9cdf3..59cba636c586 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -629,9 +629,10 @@ def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
def umax_oneuse : HasOneUseBinOp<umax>;
def umin_oneuse : HasOneUseBinOp<umin>;
-def sub_oneuse : HasOneUseBinOp<sub>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def sub_oneuse : HasOneUseBinOp<sub>;
+
def select_oneuse : HasOneUseTernaryOp<select>;
// Special conversion patterns
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a1a352642242..e90487065992 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
int AMDGPUTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1177007644ff..0d83b2a585bf 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -83,7 +83,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
unsigned getCFInstrCost(unsigned Opcode);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index da9d009c542b..3cf9a1d92469 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -214,7 +214,7 @@ public:
}
bool isReg() const override {
- return isRegKind() && !Reg.Mods.hasModifiers();
+ return isRegKind() && !hasModifiers();
}
bool isRegOrImmWithInputMods(MVT type) const {
@@ -245,6 +245,15 @@ public:
return isRegOrImmWithInputMods(MVT::f64);
}
+ bool isVReg() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ isRegClass(AMDGPU::VReg_64RegClassID) ||
+ isRegClass(AMDGPU::VReg_96RegClassID) ||
+ isRegClass(AMDGPU::VReg_128RegClassID) ||
+ isRegClass(AMDGPU::VReg_256RegClassID) ||
+ isRegClass(AMDGPU::VReg_512RegClassID);
+ }
+
bool isVReg32OrOff() const {
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
@@ -299,28 +308,32 @@ public:
bool isRegClass(unsigned RCID) const;
+ bool isRegOrInlineNoMods(unsigned RCID, MVT type) const {
+ return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers();
+ }
+
bool isSCSrcB16() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16);
}
bool isSCSrcB32() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32);
}
bool isSCSrcB64() const {
- return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
+ return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64);
}
bool isSCSrcF16() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
}
bool isSCSrcF32() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32);
}
bool isSCSrcF64() const {
- return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64);
+ return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64);
}
bool isSSrcB32() const {
@@ -350,27 +363,27 @@ public:
}
bool isVCSrcB32() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
bool isVCSrcB64() const {
- return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
+ return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
bool isVCSrcB16() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
bool isVCSrcF32() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
bool isVCSrcF64() const {
- return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
+ return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
bool isVCSrcF16() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
bool isVSrcB32() const {
@@ -534,6 +547,23 @@ public:
addRegOrImmWithInputModsOperands(Inst, N);
}
+ void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const {
+ Modifiers Mods = getModifiers();
+ Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand()));
+ assert(isRegKind());
+ addRegOperands(Inst, N);
+ }
+
+ void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasIntModifiers());
+ addRegWithInputModsOperands(Inst, N);
+ }
+
+ void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasFPModifiers());
+ addRegWithInputModsOperands(Inst, N);
+ }
+
void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
if (isImm())
addImmOperands(Inst, N);
@@ -852,9 +882,12 @@ public:
StringRef &Value);
OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseReg(OperandVector &Operands);
OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
+ OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
+ OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
@@ -1057,7 +1090,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
}
bool AMDGPUOperand::isRegClass(unsigned RCID) const {
- return isReg() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
+ return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
@@ -1468,23 +1501,28 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
- auto res = parseImm(Operands);
- if (res != MatchOperand_NoMatch) {
- return res;
- }
-
+AMDGPUAsmParser::parseReg(OperandVector &Operands) {
if (auto R = parseRegister()) {
assert(R->isReg());
R->Reg.IsForcedVOP3 = isForcedVOP3();
Operands.push_back(std::move(R));
return MatchOperand_Success;
}
- return MatchOperand_ParseFail;
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+ auto res = parseImm(Operands);
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+
+ return parseReg(Operands);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) {
// XXX: During parsing we can't determine if minus sign means
// negate-modifier or negative immediate value.
// By default we suppose it is modifier.
@@ -1514,7 +1552,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
Abs = true;
}
- auto Res = parseRegOrImm(Operands);
+ OperandMatchResultTy Res;
+ if (AllowImm) {
+ Res = parseRegOrImm(Operands);
+ } else {
+ Res = parseReg(Operands);
+ }
if (Res != MatchOperand_Success) {
return Res;
}
@@ -1548,7 +1591,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) {
bool Sext = false;
if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
@@ -1561,7 +1604,12 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
Parser.Lex();
}
- auto Res = parseRegOrImm(Operands);
+ OperandMatchResultTy Res;
+ if (AllowImm) {
+ Res = parseRegOrImm(Operands);
+ } else {
+ Res = parseReg(Operands);
+ }
if (Res != MatchOperand_Success) {
return Res;
}
@@ -1584,6 +1632,16 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) {
+ return parseRegOrImmWithFPInputMods(Operands, false);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
+ return parseRegOrImmWithIntInputMods(Operands, false);
+}
+
OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
if (Reg) {
@@ -3382,7 +3440,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
// Skip it.
continue;
} if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ Op.addRegWithFPInputModsOperands(Inst, 2);
} else if (Op.isDPPCtrl()) {
Op.addImmOperands(Inst, 1);
} else if (Op.isImm()) {
@@ -3508,7 +3566,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
// Skip it.
continue;
} else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegOrImmWithInputModsOperands(Inst, 2);
+ Op.addRegWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = I;
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 4112ad100584..48c6592ca5b2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -333,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
-def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
+def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>;
+def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 831ac5948a68..a5c0d4923d6b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -25,25 +25,6 @@ using namespace llvm;
namespace {
-class SIFoldOperands : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFoldOperands() : MachineFunctionPass(ID) {
- initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI Fold Operands"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
struct FoldCandidate {
MachineInstr *UseMI;
union {
@@ -79,6 +60,36 @@ struct FoldCandidate {
}
};
+class SIFoldOperands : public MachineFunctionPass {
+public:
+ static char ID;
+ MachineRegisterInfo *MRI;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+
+ void foldOperand(MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+
+ void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+public:
+ SIFoldOperands() : MachineFunctionPass(ID) {
+ initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Fold Operands"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // End anonymous namespace.
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
@@ -88,6 +99,34 @@ char SIFoldOperands::ID = 0;
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+// Wrapper around isInlineConstant that understands special cases when
+// instruction types are replaced during operand folding.
+static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ unsigned OpNo,
+ const MachineOperand &OpToFold) {
+ if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
+ return true;
+
+ unsigned Opc = UseMI.getOpcode();
+ switch (Opc) {
+ case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_F16_e64: {
+ // Special case for mac. Since this is replaced with mad when folded into
+ // src2, we need to check the legality for the final instruction.
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ const MCInstrDesc &MadDesc
+ = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
+ }
+ }
+ default:
+ return false;
+ }
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
@@ -141,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}
-static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
const MachineInstr *MI) {
for (auto Candidate : FoldList) {
if (Candidate.UseMI == MI)
@@ -150,7 +189,7 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
return false;
}
-static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
@@ -160,7 +199,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
@@ -227,12 +266,12 @@ static bool isUseSafeToFold(const MachineInstr &MI,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
-static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
- unsigned UseOpIdx,
- std::vector<FoldCandidate> &FoldList,
- SmallVectorImpl<MachineInstr *> &CopiesToReplace,
- const SIInstrInfo *TII, const SIRegisterInfo &TRI,
- MachineRegisterInfo &MRI) {
+void SIFoldOperands::foldOperand(
+ MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!isUseSafeToFold(*UseMI, UseOp))
@@ -264,7 +303,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
for (MachineRegisterInfo::use_iterator
- RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end();
+ RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
RSUse != RSE; ++RSUse) {
MachineInstr *RSUseMI = RSUse->getParent();
@@ -272,7 +311,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
continue;
foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
- CopiesToReplace, TII, TRI, MRI);
+ CopiesToReplace);
}
return;
@@ -287,8 +326,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- TRI.getPhysRegClass(DestReg);
+ MRI->getRegClass(DestReg) :
+ TRI->getPhysRegClass(DestReg);
unsigned MovOp = TII->getMovOpcode(DestRC);
if (MovOp == AMDGPU::COPY)
@@ -318,7 +357,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
const TargetRegisterClass *FoldRC =
- TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+ TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
OpToFold.getImm());
@@ -328,8 +367,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC
= TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI.getRegClass(UseReg) :
- TRI.getPhysRegClass(UseReg);
+ MRI->getRegClass(UseReg) :
+ TRI->getPhysRegClass(UseReg);
assert(Imm.getBitWidth() == 64);
@@ -349,20 +388,51 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
- int32_t LHS, int32_t RHS) {
+ uint32_t LHS, uint32_t RHS) {
switch (Opcode) {
case AMDGPU::V_AND_B32_e64:
+ case AMDGPU::V_AND_B32_e32:
case AMDGPU::S_AND_B32:
Result = LHS & RHS;
return true;
case AMDGPU::V_OR_B32_e64:
+ case AMDGPU::V_OR_B32_e32:
case AMDGPU::S_OR_B32:
Result = LHS | RHS;
return true;
case AMDGPU::V_XOR_B32_e64:
+ case AMDGPU::V_XOR_B32_e32:
case AMDGPU::S_XOR_B32:
Result = LHS ^ RHS;
return true;
+ case AMDGPU::V_LSHL_B32_e64:
+ case AMDGPU::V_LSHL_B32_e32:
+ case AMDGPU::S_LSHL_B32:
+ // The instruction ignores the high bits for out of bounds shifts.
+ Result = LHS << (RHS & 31);
+ return true;
+ case AMDGPU::V_LSHLREV_B32_e64:
+ case AMDGPU::V_LSHLREV_B32_e32:
+ Result = RHS << (LHS & 31);
+ return true;
+ case AMDGPU::V_LSHR_B32_e64:
+ case AMDGPU::V_LSHR_B32_e32:
+ case AMDGPU::S_LSHR_B32:
+ Result = LHS >> (RHS & 31);
+ return true;
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_LSHRREV_B32_e32:
+ Result = RHS >> (LHS & 31);
+ return true;
+ case AMDGPU::V_ASHR_I32_e64:
+ case AMDGPU::V_ASHR_I32_e32:
+ case AMDGPU::S_ASHR_I32:
+ Result = static_cast<int32_t>(LHS) >> (RHS & 31);
+ return true;
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ Result = static_cast<int32_t>(RHS) >> (LHS & 31);
+ return true;
default:
return false;
}
@@ -390,33 +460,47 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
stripExtraCopyOperands(MI);
}
+static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
+ MachineOperand &Op) {
+ if (Op.isReg()) {
+ // If this has a subregister, it obviously is a register source.
+ if (Op.getSubReg() != AMDGPU::NoSubRegister)
+ return &Op;
+
+ MachineInstr *Def = MRI.getVRegDef(Op.getReg());
+ if (Def->isMoveImmediate()) {
+ MachineOperand &ImmSrc = Def->getOperand(1);
+ if (ImmSrc.isImm())
+ return &ImmSrc;
+ }
+ }
+
+ return &Op;
+}
+
// Try to simplify operations with a constant that may appear after instruction
// selection.
+// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIInstrInfo *TII,
- MachineInstr *MI) {
+ MachineInstr *MI,
+ MachineOperand *ImmOp) {
unsigned Opc = MI->getOpcode();
-
if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) {
- MachineOperand &Src0 = MI->getOperand(1);
- if (Src0.isImm()) {
- Src0.setImm(~Src0.getImm());
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
- return true;
- }
-
- return false;
+ MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
+ mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+ return true;
}
- if (!MI->isCommutable())
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
return false;
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
+ MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
- MachineOperand *Src0 = &MI->getOperand(Src0Idx);
- MachineOperand *Src1 = &MI->getOperand(Src1Idx);
if (!Src0->isImm() && !Src1->isImm())
return false;
@@ -431,19 +515,26 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIRegisterInfo &TRI = TII->getRegisterInfo();
bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
- Src0->setImm(NewImm);
+ // Be careful to change the right operand, src0 may belong to a different
+ // instruction.
+ MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
MI->RemoveOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
return true;
}
+ if (!MI->isCommutable())
+ return false;
+
if (Src0->isImm() && !Src1->isImm()) {
std::swap(Src0, Src1);
std::swap(Src0Idx, Src1Idx);
}
int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
- if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
+ if (Opc == AMDGPU::V_OR_B32_e64 ||
+ Opc == AMDGPU::V_OR_B32_e32 ||
+ Opc == AMDGPU::S_OR_B32) {
if (Src1Val == 0) {
// y = or x, 0 => y = copy x
MI->RemoveOperand(Src1Idx);
@@ -459,6 +550,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
+ MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
MI->getOpcode() == AMDGPU::S_AND_B32) {
if (Src1Val == 0) {
// y = and x, 0 => y = v_mov_b32 0
@@ -476,29 +568,136 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
+ MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
MI->getOpcode() == AMDGPU::S_XOR_B32) {
if (Src1Val == 0) {
// y = xor x, 0 => y = copy x
MI->RemoveOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ return true;
}
}
return false;
}
+void SIFoldOperands::foldInstOperand(MachineInstr &MI,
+ MachineOperand &OpToFold) const {
+ // We need mutate the operands of new mov instructions to add implicit
+ // uses of EXEC, but adding them invalidates the use_iterator, so defer
+ // this.
+ SmallVector<MachineInstr *, 4> CopiesToReplace;
+ SmallVector<FoldCandidate, 4> FoldList;
+ MachineOperand &Dst = MI.getOperand(0);
+
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ if (FoldingImm) {
+ unsigned NumLiteralUses = 0;
+ MachineOperand *NonInlineUse = nullptr;
+ int NonInlineUseOpNo = -1;
+
+ MachineRegisterInfo::use_iterator NextUse, NextInstUse;
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ Use != E; Use = NextUse) {
+ NextUse = std::next(Use);
+ MachineInstr *UseMI = Use->getParent();
+ unsigned OpNo = Use.getOperandNo();
+
+ // Folding the immediate may reveal operations that can be constant
+ // folded or replaced with a copy. This can happen for example after
+ // frame indices are lowered to constants or from splitting 64-bit
+ // constants.
+ //
+ // We may also encounter cases where one or both operands are
+ // immediates materialized into a register, which would ordinarily not
+ // be folded due to multiple uses or operand constraints.
+
+ if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
+ DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
+
+ // Some constant folding cases change the same immediate's use to a new
+ // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
+ // again. The same constant folded instruction could also have a second
+ // use operand.
+ NextUse = MRI->use_begin(Dst.getReg());
+ continue;
+ }
+
+ // Try to fold any inline immediate uses, and then only fold other
+ // constants if they have one use.
+ //
+ // The legality of the inline immediate must be checked based on the use
+ // operand, not the defining instruction, because 32-bit instructions
+ // with 32-bit inline immediate sources may be used to materialize
+ // constants used in 16-bit operands.
+ //
+ // e.g. it is unsafe to fold:
+ // s_mov_b32 s0, 1.0 // materializes 0x3f800000
+ // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
+
+ // Folding immediates with more than one use will increase program size.
+ // FIXME: This will also reduce register usage, which may be better
+ // in some cases. A better heuristic is needed.
+ if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else {
+ if (++NumLiteralUses == 1) {
+ NonInlineUse = &*Use;
+ NonInlineUseOpNo = OpNo;
+ }
+ }
+ }
+
+ if (NumLiteralUses == 1) {
+ MachineInstr *UseMI = NonInlineUse->getParent();
+ foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
+ }
+ } else {
+ // Folding register.
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ Use != E; ++Use) {
+ MachineInstr *UseMI = Use->getParent();
+
+ foldOperand(OpToFold, UseMI, Use.getOperandNo(),
+ FoldList, CopiesToReplace);
+ }
+ }
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ // Make sure we add EXEC uses to any new v_mov instructions created.
+ for (MachineInstr *Copy : CopiesToReplace)
+ Copy->addImplicitDefUseOperands(*MF);
+
+ for (FoldCandidate &Fold : FoldList) {
+ if (updateOperand(Fold, *TRI)) {
+ // Clear kill flags.
+ if (Fold.isReg()) {
+ assert(Fold.OpToFold && Fold.OpToFold->isReg());
+ // FIXME: Probably shouldn't bother trying to fold if not an
+ // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+ // copies.
+ MRI->clearKillFlags(Fold.OpToFold->getReg());
+ }
+ DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+ static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ }
+ }
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
@@ -512,8 +711,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
- // FIXME: We could also be folding things like FrameIndexes and
- // TargetIndexes.
+ // FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
continue;
@@ -532,90 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;
- // We need mutate the operands of new mov instructions to add implicit
- // uses of EXEC, but adding them invalidates the use_iterator, so defer
- // this.
- SmallVector<MachineInstr *, 4> CopiesToReplace;
-
- std::vector<FoldCandidate> FoldList;
- if (FoldingImm) {
- unsigned NumLiteralUses = 0;
- MachineOperand *NonInlineUse = nullptr;
- int NonInlineUseOpNo = -1;
-
- // Try to fold any inline immediate uses, and then only fold other
- // constants if they have one use.
- //
- // The legality of the inline immediate must be checked based on the use
- // operand, not the defining instruction, because 32-bit instructions
- // with 32-bit inline immediate sources may be used to materialize
- // constants used in 16-bit operands.
- //
- // e.g. it is unsafe to fold:
- // s_mov_b32 s0, 1.0 // materializes 0x3f800000
- // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
-
- // Folding immediates with more than one use will increase program size.
- // FIXME: This will also reduce register usage, which may be better
- // in some cases. A better heuristic is needed.
- for (MachineRegisterInfo::use_iterator
- Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
- Use != E; ++Use) {
- MachineInstr *UseMI = Use->getParent();
- unsigned OpNo = Use.getOperandNo();
-
- if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
- foldOperand(OpToFold, UseMI, OpNo, FoldList,
- CopiesToReplace, TII, TRI, MRI);
- } else {
- if (++NumLiteralUses == 1) {
- NonInlineUse = &*Use;
- NonInlineUseOpNo = OpNo;
- }
- }
- }
-
- if (NumLiteralUses == 1) {
- MachineInstr *UseMI = NonInlineUse->getParent();
- foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
- CopiesToReplace, TII, TRI, MRI);
- }
- } else {
- // Folding register.
- for (MachineRegisterInfo::use_iterator
- Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
- Use != E; ++Use) {
- MachineInstr *UseMI = Use->getParent();
-
- foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
- CopiesToReplace, TII, TRI, MRI);
- }
- }
-
- // Make sure we add EXEC uses to any new v_mov instructions created.
- for (MachineInstr *Copy : CopiesToReplace)
- Copy->addImplicitDefUseOperands(MF);
-
- for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, TRI)) {
- // Clear kill flags.
- if (Fold.isReg()) {
- assert(Fold.OpToFold && Fold.OpToFold->isReg());
- // FIXME: Probably shouldn't bother trying to fold if not an
- // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
- // copies.
- MRI.clearKillFlags(Fold.OpToFold->getReg());
- }
- DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
- static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
-
- // Folding the immediate may reveal operations that can be constant
- // folded or replaced with a copy. This can happen for example after
- // frame indices are lowered to constants or from splitting 64-bit
- // constants.
- tryConstantFoldOp(MRI, TII, Fold.UseMI);
- }
- }
+ foldInstOperand(MI, OpToFold);
}
}
return false;
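The block removed above (now factored into foldInstOperand) makes two separate decisions: inline immediates are folded into every use, while a 32-bit literal is folded only when it has a single use, since duplicating a literal across several encodings grows the program; and inline-immediate legality is judged per use operand, because the same value encodes differently at different operand widths. A standalone C++ illustration of that width mismatch follows; it is a sketch with no LLVM dependencies, and the half-precision encoder is a deliberately simplified assumption that only handles exactly representable normal values.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Re-encode an exactly representable float as IEEE binary16. Simplified on
// purpose: no rounding, NaN, infinity or subnormal handling.
static uint16_t asHalfBits(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  uint16_t Sign = (B >> 16) & 0x8000;
  int32_t Exp = (int32_t)((B >> 23) & 0xff) - 127 + 15; // rebias 8-bit -> 5-bit exponent
  uint16_t Mant = (B >> 13) & 0x3ff;                    // keep the top 10 mantissa bits
  return Sign | (uint16_t)(Exp << 10) | Mant;
}

int main() {
  float One = 1.0f;
  uint32_t F32Bits;
  std::memcpy(&F32Bits, &One, sizeof(F32Bits));
  std::printf("f32 1.0 -> 0x%08x\n", (unsigned)F32Bits);        // 0x3f800000, the f32 encoding
  std::printf("f16 1.0 -> 0x%04x\n", (unsigned)asHalfBits(One)); // 0x3c00, what an f16 operand must see
  return 0;
}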
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 34096e158039..ebaefae3bfef 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -557,6 +557,27 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+def FPVRegInputModsMatchClass : AsmOperandClass {
+ let Name = "VRegWithFPInputMods";
+ let ParserMethod = "parseRegWithFPInputMods";
+ let PredicateMethod = "isVReg";
+}
+
+def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndFPInputMods";
+}
+
+def IntVRegInputModsMatchClass : AsmOperandClass {
+ let Name = "VRegWithIntInputMods";
+ let ParserMethod = "parseRegWithIntInputMods";
+ let PredicateMethod = "isVReg";
+}
+
+def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndIntInputMods";
+}
+
+
//===----------------------------------------------------------------------===//
// Complex patterns
//===----------------------------------------------------------------------===//
@@ -761,6 +782,15 @@ class getSrcMod <ValueType VT> {
);
}
+// Return the type of the input modifiers operand for the specified input operand for SDWA/DPP
+class getSrcModExt <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
+}
+
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
@@ -1001,6 +1031,11 @@ class VOPProfile <list<ValueType> _ArgVT> {
field Operand Src0Mod = getSrcMod<Src0VT>.ret;
field Operand Src1Mod = getSrcMod<Src1VT>.ret;
field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+ field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
+ field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
+ field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
+ field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
+
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
@@ -1038,15 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod>.ret;
+ HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, DstVT>.ret;
+ HasModifiers, Src0ModSDWA, Src1ModSDWA,
+ DstVT>.ret;
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc35c2edc8d3..b86c04191189 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -871,6 +871,11 @@ def : Pat <
>;
def : Pat <
+ (i16 (sext_inreg i16:$src, i1)),
+ (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
+>;
+
+def : Pat <
(i16 (sext_inreg i16:$src, i8)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
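In the S_BFE_I32 patterns above, the immediate packs the field offset into the low bits and the field width into bits 16 and up, so sign-extending the low 1 or 8 bits of a 16-bit value becomes a signed bitfield extract with offset 0 and width 1 (0x00010000) or width 8 (0x80000). Below is a plain C++ model of that packed operand, a sketch under the assumption that the width is nonzero and below 32, not a description of the hardware encoding.

#include <cassert>
#include <cstdint>

// Signed bitfield extract driven by the packed immediate used above:
// offset in the low bits, width in the bits starting at position 16.
static int32_t signedBitfieldExtract(uint32_t Src, uint32_t PackedImm) {
  uint32_t Offset = PackedImm & 0x3f;
  uint32_t Width = (PackedImm >> 16) & 0x7f;
  assert(Width > 0 && Width < 32 && Offset + Width <= 32);
  uint64_t Field = (Src >> Offset) & ((1u << Width) - 1);
  uint64_t Sign = 1u << (Width - 1);
  // Subtracting twice the sign bit (when set) sign-extends the field.
  return (int32_t)((int64_t)Field - (int64_t)(2 * (Field & Sign)));
}

int main() {
  // (i16 (sext_inreg i16:$src, i1)) -> S_BFE_I32 $src, 0x00010000
  assert(signedBitfieldExtract(0x0001, 0x00010000) == -1);
  assert(signedBitfieldExtract(0x0000, 0x00010000) == 0);
  // (i16 (sext_inreg i16:$src, i8)) -> S_BFE_I32 $src, 0x00080000
  assert(signedBitfieldExtract(0x00ff, 0x00080000) == -1);
  assert(signedBitfieldExtract(0x007f, 0x00080000) == 127);
  return 0;
}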
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index b27d7c691032..dd31dc690840 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -84,12 +84,17 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
// FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
// a special case for it. It can only be shrunk if the third operand
  // is vcc. We should handle this the same way we handle vopc, by adding
- // a register allocation hint pre-regalloc and then do the shrining
+ // a register allocation hint pre-regalloc and then do the shrinking
// post-regalloc.
if (Src2) {
switch (MI.getOpcode()) {
default: return false;
+ case AMDGPU::V_ADDC_U32_e64:
+ case AMDGPU::V_SUBB_U32_e64:
+ // Additional verification is needed for sdst/src2.
+ return true;
+
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
if (!isVGPR(Src2, TRI, MRI) ||
@@ -174,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
const MachineOperand &Orig) {
for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.getReg() == AMDGPU::VCC) {
+ if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
Use.setIsUndef(Orig.isUndef());
Use.setIsKill(Orig.isKill());
return;
@@ -456,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ // Check for the bool flag output for instructions like V_ADD_I32_e64.
+ const MachineOperand *SDst = TII->getNamedOperand(MI,
+ AMDGPU::OpName::sdst);
+
+ // Check the carry-in operand for v_addc_u32_e64.
+ const MachineOperand *Src2 = TII->getNamedOperand(MI,
+ AMDGPU::OpName::src2);
+
+ if (SDst) {
+ if (SDst->getReg() != AMDGPU::VCC) {
+ if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
+ MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
+ continue;
+ }
+
+ // All of the instructions with carry outs also have an SGPR input in
+ // src2.
+ if (Src2 && Src2->getReg() != AMDGPU::VCC) {
+ if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
+ MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
+
+ continue;
+ }
+ }
+
// We can shrink this instruction
DEBUG(dbgs() << "Shrinking " << MI);
@@ -481,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (Src1)
Inst32.addOperand(*Src1);
- const MachineOperand *Src2 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2) {
int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
if (Op32Src2Idx != -1) {
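The sdst/src2 checks added above only allow shrinking to the 32-bit encoding when both the carry-out and the carry-in already live in VCC; otherwise they record a register-allocation hint and leave the instruction for a later run. A standalone sketch of that decision follows; Reg, Inst64 and the hint map are hypothetical stand-ins, not LLVM's MachineOperand or MachineRegisterInfo API.

#include <cassert>
#include <map>
#include <optional>

enum Reg { VCC, Virt0, Virt1 };

struct Inst64 {
  std::optional<Reg> SDst; // carry-out, if the opcode has one
  std::optional<Reg> Src2; // carry-in, if the opcode has one
};

static std::map<Reg, Reg> AllocationHints; // register -> preferred assignment

// True when the 64-bit encoding can be rewritten to the 32-bit one, which
// reads and writes VCC implicitly instead of taking sdst/src2 operands.
static bool canShrinkNow(const Inst64 &MI) {
  if (MI.SDst && *MI.SDst != VCC) {
    AllocationHints[*MI.SDst] = VCC; // hint, then retry after register allocation
    return false;
  }
  if (MI.Src2 && *MI.Src2 != VCC) {
    AllocationHints[*MI.Src2] = VCC;
    return false;
  }
  return true;
}

int main() {
  assert(canShrinkNow({VCC, VCC}));
  assert(!canShrinkNow({Virt0, VCC}) && AllocationHints.at(Virt0) == VCC);
  return 0;
}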
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index bff706cdc1dc..a15b9ceff2f4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -232,7 +232,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0,
+ let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 20fb7f7bcab7..00e5ab3db0b7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -183,13 +183,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
- let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0,
- FP32InputMods:$src1_modifiers, Src1DPP:$src1,
+ let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0,
- FP32InputMods:$src1_modifiers, Src1SDWA:$src1,
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
VGPR_32:$src2, // stub argument
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index c431d9db801e..16a456da3c67 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -517,8 +517,8 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$sdst, $src0_modifiers, $src1";
- let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
- Int32InputMods:$src1_modifiers, Src1RC64:$src1,
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
let HasSrc1Mods = 0;
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afba1587a743..32b7c87e61bb 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -608,15 +608,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
// a __gnu_ prefix (which is the default).
if (Subtarget->isTargetAEABI()) {
- setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
- setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
- setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f");
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
+ { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
+ { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
}
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
else
addRegisterClass(MVT::i32, &ARM::GPRRegClass);
+
if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
!Subtarget->isThumb1Only()) {
addRegisterClass(MVT::f32, &ARM::SPRRegClass);
@@ -976,6 +988,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
+
// Register based DivRem for AEABI (RTABI 4.2)
if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
@@ -984,29 +997,49 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UREM, MVT::i64, Custom);
HasStandaloneRem = false;
- for (const auto &LC :
- {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32})
- setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv"
- : "__aeabi_idivmod");
- setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows()
- ? "__rt_sdiv64"
- : "__aeabi_ldivmod");
- for (const auto &LC :
- {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32})
- setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv"
- : "__aeabi_uidivmod");
- setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows()
- ? "__rt_udiv64"
- : "__aeabi_uldivmod");
-
- setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
- setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
+ if (Subtarget->isTargetWindows()) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
+ { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
+ { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
+ { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
+
+ { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
+ { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
+ { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
+ { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
+ } else {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
+ { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
+ { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
+ { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
+
+ { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
+ { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
+ { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
+ { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
+ }
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
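The hunk above replaces a run of setLibcallName/setLibcallCallingConv calls with one table per target flavour plus a loop, so a libcall's name and its calling convention can no longer drift apart. A generic, self-contained sketch of that table-driven registration idiom follows; the Libcall and CallConv enums and the Registry type are invented for illustration and are not the LLVM RTLIB interface.

#include <cstdio>

enum class Libcall { SDivRem32, SDivRem64, UDivRem32, UDivRem64 };
enum class CallConv { ARM_AAPCS };

struct Registry {
  void setName(Libcall, const char *Name) { std::printf("registered %s\n", Name); }
  void setCallingConv(Libcall, CallConv) {}
};

static void registerDivRemLibcalls(Registry &R, bool IsWindows) {
  struct Entry { Libcall Op; const char *Name; CallConv CC; };
  static const Entry WindowsCalls[] = {
      {Libcall::SDivRem32, "__rt_sdiv",   CallConv::ARM_AAPCS},
      {Libcall::SDivRem64, "__rt_sdiv64", CallConv::ARM_AAPCS},
      {Libcall::UDivRem32, "__rt_udiv",   CallConv::ARM_AAPCS},
      {Libcall::UDivRem64, "__rt_udiv64", CallConv::ARM_AAPCS},
  };
  static const Entry AeabiCalls[] = {
      {Libcall::SDivRem32, "__aeabi_idivmod",  CallConv::ARM_AAPCS},
      {Libcall::SDivRem64, "__aeabi_ldivmod",  CallConv::ARM_AAPCS},
      {Libcall::UDivRem32, "__aeabi_uidivmod", CallConv::ARM_AAPCS},
      {Libcall::UDivRem64, "__aeabi_uldivmod", CallConv::ARM_AAPCS},
  };
  // Pick the table once; every entry carries both the name and the convention.
  const Entry (&Calls)[4] = IsWindows ? WindowsCalls : AeabiCalls;
  for (const Entry &LC : Calls) {
    R.setName(LC.Op, LC.Name);
    R.setCallingConv(LC.Op, LC.CC);
  }
}

int main() {
  Registry R;
  registerDivRemLibcalls(R, /*IsWindows=*/false);
  return 0;
}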
@@ -3305,11 +3338,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- case Intrinsic::arm_rbit: {
- assert(Op.getOperand(1).getValueType() == MVT::i32 &&
- "RBIT intrinsic must have i32 type!");
- return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
- }
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
@@ -9232,12 +9260,102 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
return SDValue();
}
-// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
-// (only after legalization).
-static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+static bool IsVUZPShuffleNode(SDNode *N) {
+ // VUZP shuffle node.
+ if (N->getOpcode() == ARMISD::VUZP)
+ return true;
+
+ // "VUZP" on i32 is an alias for VTRN.
+ if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
+ return true;
+
+ return false;
+}
+
+static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
+ // Look for ADD(VUZP.0, VUZP.1).
+ if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
+ N0 == N1)
+ return SDValue();
+
+ // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
+ if (!N->getValueType(0).is64BitVector())
+ return SDValue();
+ // Generate vpadd.
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc dl(N);
+ SDNode *Unzip = N0.getNode();
+ EVT VT = N->getValueType(0);
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Ops.push_back(Unzip->getOperand(0));
+ Ops.push_back(Unzip->getOperand(1));
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
+}
+
+static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Check for two extended operands.
+ if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N1.getOpcode() == ISD::SIGN_EXTEND) &&
+ !(N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N1.getOpcode() == ISD::ZERO_EXTEND))
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+
+ // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
+ if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
+ N00 == N10)
+ return SDValue();
+
+ // We only recognize Q register paddl here; this can't be reached until
+ // after type legalization.
+ if (!N00.getValueType().is64BitVector() ||
+ !N0.getValueType().is128BitVector())
+ return SDValue();
+
+ // Generate vpaddl.
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ SmallVector<SDValue, 8> Ops;
+ // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
+ unsigned Opcode;
+ if (N0.getOpcode() == ISD::SIGN_EXTEND)
+ Opcode = Intrinsic::arm_neon_vpaddls;
+ else
+ Opcode = Intrinsic::arm_neon_vpaddlu;
+ Ops.push_back(DAG.getConstant(Opcode, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ EVT ElemTy = N00.getValueType().getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
+ N00.getOperand(0), N00.getOperand(1));
+ Ops.push_back(Concat);
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
+}
+
+// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
+// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
+// much easier to match.
+static SDValue
+AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
  // Only perform this optimization after legalization and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
@@ -9293,6 +9411,10 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+ // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
+ if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
+ return SDValue();
+
// Create VPADDL node.
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -9564,9 +9686,15 @@ static SDValue PerformADDCCombine(SDNode *N,
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget){
+ // Attempt to create vpadd for this add.
+ if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
+ return Result;
// Attempt to create vpaddl for this add.
- if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
+ if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
+ return Result;
+ if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
+ Subtarget))
return Result;
// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
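The two combines added above rest on the fact that adding the even-lane half of an unzip to the odd-lane half is exactly a pairwise add, which vpadd computes in one instruction (and vpaddl in its widening form). A plain scalar C++ model of that equivalence, with no NEON intrinsics and the widening left out for brevity:

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Deinterleave into even/odd lanes (what VUZP produces), then add the halves.
template <std::size_t N>
static std::array<int32_t, N / 2> addAfterUnzip(const std::array<int32_t, N> &V) {
  std::array<int32_t, N / 2> Even{}, Odd{}, Sum{};
  for (std::size_t I = 0; I < N / 2; ++I) {
    Even[I] = V[2 * I];    // VUZP.0: even lanes
    Odd[I] = V[2 * I + 1]; // VUZP.1: odd lanes
  }
  for (std::size_t I = 0; I < N / 2; ++I)
    Sum[I] = Even[I] + Odd[I]; // the ADD the combine starts from
  return Sum;
}

// What a single pairwise add computes directly over the original vector.
template <std::size_t N>
static std::array<int32_t, N / 2> pairwiseAdd(const std::array<int32_t, N> &V) {
  std::array<int32_t, N / 2> Sum{};
  for (std::size_t I = 0; I < N / 2; ++I)
    Sum[I] = V[2 * I] + V[2 * I + 1];
  return Sum;
}

int main() {
  const std::array<int32_t, 8> V = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(addAfterUnzip(V) == pairwiseAdd(V)); // {3, 7, 11, 15} either way
  return 0;
}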
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
index 5255d82d647a..7a7f91f4d3c4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -16,16 +16,28 @@
#define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetLowering.h"
-#include <vector>
+#include <utility>
namespace llvm {
- class ARMConstantPoolValue;
- class ARMSubtarget;
+
+class ARMSubtarget;
+class InstrItineraryData;
namespace ARMISD {
+
// ARM Specific DAG Nodes
enum NodeType : unsigned {
// Start the numbering where the builtin ops and target ops leave off.
@@ -217,12 +229,15 @@ namespace llvm {
VST3LN_UPD,
VST4LN_UPD
};
- }
+
+ } // end namespace ARMISD
/// Define some predicates that are used for node matching.
namespace ARM {
+
bool isBitFieldInvertedMask(unsigned v);
- }
+
+ } // end namespace ARM
//===--------------------------------------------------------------------===//
// ARMTargetLowering - ARM Implementation of the TargetLowering interface
@@ -531,6 +546,7 @@ namespace llvm {
std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+
void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
SDValue &Arg, RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -623,6 +639,7 @@ namespace llvm {
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
}
+
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
@@ -644,9 +661,8 @@ namespace llvm {
unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
bool ForceMutable = false) const;
- SDValue
- LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
/// HandleByVal - Target-specific cleanup for ByVal support.
void HandleByVal(CCState *, unsigned &, unsigned) const override;
@@ -712,9 +728,12 @@ namespace llvm {
};
namespace ARM {
+
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
- }
-}
-#endif // ARMISELLOWERING_H
+ } // end namespace ARM
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 9bd036a1eace..324087d670b5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -29,7 +29,33 @@ using namespace llvm;
// into an ARMGenRegisterBankInfo.def (similar to AArch64).
namespace llvm {
namespace ARM {
-RegisterBank GPRRegBank;
+const uint32_t GPRCoverageData[] = {
+ // Classes 0-31
+ (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) |
+ (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) |
+ (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) |
+ (1u << ARM::GPRnopc_and_hGPRRegClassID) |
+ (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) |
+ (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) |
+ (1u << ARM::hGPR_and_tcGPRRegClassID),
+ // Classes 32-63
+ 0,
+ // Classes 64-96
+ 0,
+ // FIXME: Some of the entries below this point can be safely removed once
+ // this is tablegenerated. It's only needed because of the hardcoded
+ // register class limit.
+ // Classes 97-128
+ 0,
+ // Classes 129-160
+ 0,
+ // Classes 161-192
+ 0,
+ // Classes 193-224
+ 0,
+};
+
+RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData);
RegisterBank *RegBanks[] = {&GPRRegBank};
RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank};
@@ -51,14 +77,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
return;
AlreadyInit = true;
- // Initialize the GPR bank.
- createRegisterBank(ARM::GPRRegBankID, "GPRB");
-
- addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRRegClassID, TRI);
- addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRwithAPSRRegClassID, TRI);
const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
(void)RBGPR;
assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
+
+ // Initialize the GPR bank.
assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
"Subclass not added?");
assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
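The GPRCoverageData table introduced above is a coverage bitmask: word i holds one bit per register class ID in the range [32*i, 32*i+31], which is why each entry is an OR of (1u << ...RegClassID) terms. A standalone sketch of building and querying such a packed mask, with made-up class IDs rather than the generated ARM ones:

#include <cassert>
#include <cstdint>

static constexpr unsigned GPRClassID = 3;  // hypothetical IDs for illustration
static constexpr unsigned SPRClassID = 40;

static constexpr uint32_t Coverage[] = {
    1u << (GPRClassID % 32), // word 0: class IDs 0-31
    0,                       // word 1: class IDs 32-63
};

static bool covers(unsigned ClassID) {
  return Coverage[ClassID / 32] & (1u << (ClassID % 32));
}

int main() {
  assert(covers(GPRClassID));
  assert(!covers(SPRClassID));
  return 0;
}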
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cc001b596785..2b6b36bc3e68 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -433,7 +433,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
int ARMTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 731a5adf3d73..3c83cd92a61a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -114,7 +114,8 @@ public:
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index 7fcb3ce45bbb..d95c16fc3caf 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -54,7 +54,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
switch (ISD) {
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 26e0f9a94368..f28e8b36fdbc 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -14,11 +14,13 @@
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -1456,9 +1458,12 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
return Result;
}
-static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
- return DAG.getConstant(Op->getConstantOperandVal(ImmOp), SDLoc(Op),
- Op->getValueType(0));
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG,
+ bool IsSigned = false) {
+ return DAG.getConstant(
+ APInt(Op->getValueType(0).getScalarType().getSizeInBits(),
+ Op->getConstantOperandVal(ImmOp), IsSigned),
+ SDLoc(Op), Op->getValueType(0));
}
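The change to lowerMSASplatImm above matters because several MSA immediates are small signed fields (the ldi immediate, for example), so the raw operand value has to be sign-extended to the element width before it is splatted. A standalone sketch of that sign extension; splatElement is a hypothetical helper standing in for the APInt(BitWidth, Value, /*isSigned=*/true) construction, not a Mips API.

#include <cassert>
#include <cstdint>

// Sign-extend the low 'Bits' bits of Raw into a 16-bit element.
static int16_t splatElement(uint64_t Raw, unsigned Bits) {
  uint64_t Mask = (1ull << Bits) - 1;
  uint64_t Field = Raw & Mask;
  uint64_t Sign = 1ull << (Bits - 1);
  int64_t Value = (int64_t)Field - (int64_t)(2 * (Field & Sign));
  return (int16_t)Value;
}

int main() {
  // ldi.h of -1 arrives as the 10-bit pattern 0x3ff; the splatted halfword
  // must be 0xffff, not 0x03ff.
  assert(splatElement(0x3ff, 10) == -1);
  assert((uint16_t)splatElement(0x3ff, 10) == 0xffff);
  assert(splatElement(0x1ff, 10) == 511); // positive values are unchanged
  return 0;
}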
static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
@@ -1564,8 +1569,8 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
-
- switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
+ unsigned Intrinsic = cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue();
+ switch (Intrinsic) {
default:
return SDValue();
case Intrinsic::mips_shilo:
@@ -1635,6 +1640,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
// binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear)
EVT VecTy = Op->getValueType(0);
EVT EltTy = VecTy.getVectorElementType();
+ if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
+ report_fatal_error("Immediate out of range");
APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
Op->getConstantOperandVal(3));
return DAG.getNode(ISD::VSELECT, DL, VecTy,
@@ -1648,6 +1655,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
// binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear)
EVT VecTy = Op->getValueType(0);
EVT EltTy = VecTy.getVectorElementType();
+ if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
+ report_fatal_error("Immediate out of range");
APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
Op->getConstantOperandVal(3));
return DAG.getNode(ISD::VSELECT, DL, VecTy,
@@ -1741,7 +1750,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_ceqi_w:
case Intrinsic::mips_ceqi_d:
return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
- lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+ lowerMSASplatImm(Op, 2, DAG, true), ISD::SETEQ);
case Intrinsic::mips_cle_s_b:
case Intrinsic::mips_cle_s_h:
case Intrinsic::mips_cle_s_w:
@@ -1753,7 +1762,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_clei_s_w:
case Intrinsic::mips_clei_s_d:
return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
- lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+ lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLE);
case Intrinsic::mips_cle_u_b:
case Intrinsic::mips_cle_u_h:
case Intrinsic::mips_cle_u_w:
@@ -1777,7 +1786,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_clti_s_w:
case Intrinsic::mips_clti_s_d:
return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
- lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+ lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLT);
case Intrinsic::mips_clt_u_b:
case Intrinsic::mips_clt_u_h:
case Intrinsic::mips_clt_u_w:
@@ -1990,15 +1999,28 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_insve_b:
case Intrinsic::mips_insve_h:
case Intrinsic::mips_insve_w:
- case Intrinsic::mips_insve_d:
+ case Intrinsic::mips_insve_d: {
+ // Report an error for out of range values.
+ int64_t Max;
+ switch (Intrinsic) {
+ case Intrinsic::mips_insve_b: Max = 15; break;
+ case Intrinsic::mips_insve_h: Max = 7; break;
+ case Intrinsic::mips_insve_w: Max = 3; break;
+ case Intrinsic::mips_insve_d: Max = 1; break;
+ default: llvm_unreachable("Unmatched intrinsic");
+ }
+ int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ if (Value < 0 || Value > Max)
+ report_fatal_error("Immediate out of range");
return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2), Op->getOperand(3),
DAG.getConstant(0, DL, MVT::i32));
+ }
case Intrinsic::mips_ldi_b:
case Intrinsic::mips_ldi_h:
case Intrinsic::mips_ldi_w:
case Intrinsic::mips_ldi_d:
- return lowerMSASplatImm(Op, 1, DAG);
+ return lowerMSASplatImm(Op, 1, DAG, true);
case Intrinsic::mips_lsa:
case Intrinsic::mips_dlsa: {
EVT ResTy = Op->getValueType(0);
@@ -2032,7 +2054,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_maxi_s_w:
case Intrinsic::mips_maxi_s_d:
return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
- Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true));
case Intrinsic::mips_maxi_u_b:
case Intrinsic::mips_maxi_u_h:
case Intrinsic::mips_maxi_u_w:
@@ -2056,7 +2078,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_mini_s_w:
case Intrinsic::mips_mini_s_d:
return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
- Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true));
case Intrinsic::mips_mini_u_b:
case Intrinsic::mips_mini_u_h:
case Intrinsic::mips_mini_u_w:
@@ -2129,11 +2151,59 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_pcnt_w:
case Intrinsic::mips_pcnt_d:
return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_sat_s_b:
+ case Intrinsic::mips_sat_s_h:
+ case Intrinsic::mips_sat_s_w:
+ case Intrinsic::mips_sat_s_d:
+ case Intrinsic::mips_sat_u_b:
+ case Intrinsic::mips_sat_u_h:
+ case Intrinsic::mips_sat_u_w:
+ case Intrinsic::mips_sat_u_d: {
+ // Report an error for out of range values.
+ int64_t Max;
+ switch (Intrinsic) {
+ case Intrinsic::mips_sat_s_b:
+ case Intrinsic::mips_sat_u_b: Max = 7; break;
+ case Intrinsic::mips_sat_s_h:
+ case Intrinsic::mips_sat_u_h: Max = 15; break;
+ case Intrinsic::mips_sat_s_w:
+ case Intrinsic::mips_sat_u_w: Max = 31; break;
+ case Intrinsic::mips_sat_s_d:
+ case Intrinsic::mips_sat_u_d: Max = 63; break;
+ default: llvm_unreachable("Unmatched intrinsic");
+ }
+ int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ if (Value < 0 || Value > Max)
+ report_fatal_error("Immediate out of range");
+ return SDValue();
+ }
case Intrinsic::mips_shf_b:
case Intrinsic::mips_shf_h:
- case Intrinsic::mips_shf_w:
+ case Intrinsic::mips_shf_w: {
+ int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ if (Value < 0 || Value > 255)
+ report_fatal_error("Immediate out of range");
return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
Op->getOperand(2), Op->getOperand(1));
+ }
+ case Intrinsic::mips_sldi_b:
+ case Intrinsic::mips_sldi_h:
+ case Intrinsic::mips_sldi_w:
+ case Intrinsic::mips_sldi_d: {
+ // Report an error for out of range values.
+ int64_t Max;
+ switch (Intrinsic) {
+ case Intrinsic::mips_sldi_b: Max = 15; break;
+ case Intrinsic::mips_sldi_h: Max = 7; break;
+ case Intrinsic::mips_sldi_w: Max = 3; break;
+ case Intrinsic::mips_sldi_d: Max = 1; break;
+ default: llvm_unreachable("Unmatched intrinsic");
+ }
+ int64_t Value = cast<ConstantSDNode>(Op->getOperand(3))->getSExtValue();
+ if (Value < 0 || Value > Max)
+ report_fatal_error("Immediate out of range");
+ return SDValue();
+ }
case Intrinsic::mips_sll_b:
case Intrinsic::mips_sll_h:
case Intrinsic::mips_sll_w:
@@ -2176,6 +2246,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_srai_d:
return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_srari_b:
+ case Intrinsic::mips_srari_h:
+ case Intrinsic::mips_srari_w:
+ case Intrinsic::mips_srari_d: {
+ // Report an error for out of range values.
+ int64_t Max;
+ switch (Intrinsic) {
+ case Intrinsic::mips_srari_b: Max = 7; break;
+ case Intrinsic::mips_srari_h: Max = 15; break;
+ case Intrinsic::mips_srari_w: Max = 31; break;
+ case Intrinsic::mips_srari_d: Max = 63; break;
+ default: llvm_unreachable("Unmatched intrinsic");
+ }
+ int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ if (Value < 0 || Value > Max)
+ report_fatal_error("Immediate out of range");
+ return SDValue();
+ }
case Intrinsic::mips_srl_b:
case Intrinsic::mips_srl_h:
case Intrinsic::mips_srl_w:
@@ -2188,6 +2276,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_srli_d:
return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_srlri_b:
+ case Intrinsic::mips_srlri_h:
+ case Intrinsic::mips_srlri_w:
+ case Intrinsic::mips_srlri_d: {
+ // Report an error for out of range values.
+ int64_t Max;
+ switch (Intrinsic) {
+ case Intrinsic::mips_srlri_b: Max = 7; break;
+ case Intrinsic::mips_srlri_h: Max = 15; break;
+ case Intrinsic::mips_srlri_w: Max = 31; break;
+ case Intrinsic::mips_srlri_d: Max = 63; break;
+ default: llvm_unreachable("Unmatched intrinsic");
+ }
+ int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ if (Value < 0 || Value > Max)
+ report_fatal_error("Immediate out of range");
+ return SDValue();
+ }
case Intrinsic::mips_subv_b:
case Intrinsic::mips_subv_h:
case Intrinsic::mips_subv_w:
@@ -2219,7 +2325,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
-static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr,
+ const MipsSubtarget &Subtarget) {
SDLoc DL(Op);
SDValue ChainIn = Op->getOperand(0);
SDValue Address = Op->getOperand(2);
@@ -2227,6 +2334,12 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
EVT ResTy = Op->getValueType(0);
EVT PtrTy = Address->getValueType(0);
+  // For N64, addresses have the underlying type MVT::i64. This intrinsic
+ // however takes an i32 signed constant offset. The actual type of the
+ // intrinsic is a scaled signed i10.
+ if (Subtarget.isABI_N64())
+ Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset);
+
Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(),
/* Alignment = */ 16);
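The sign extension added above is needed because the MSA ld/st intrinsics take a signed 32-bit constant offset while N64 pointers are 64-bit, so a negative offset must not be zero-extended into a huge positive displacement. A standalone C++ illustration of the difference (plain integers, no SelectionDAG types):

#include <cassert>
#include <cstdint>

static uint64_t effectiveAddress(uint64_t Base, int32_t Offset) {
  return Base + (int64_t)Offset;            // sign-extend, then add
}

static uint64_t brokenAddress(uint64_t Base, int32_t Offset) {
  return Base + (uint64_t)(uint32_t)Offset; // zero-extend: wrong for Offset < 0
}

int main() {
  const uint64_t Base = 0x1000;
  assert(effectiveAddress(Base, -16) == 0x0ff0);
  assert(brokenAddress(Base, -16) != 0x0ff0); // ends up as Base + 0xfffffff0
  return 0;
}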
@@ -2282,11 +2395,12 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::mips_ld_h:
case Intrinsic::mips_ld_w:
case Intrinsic::mips_ld_d:
- return lowerMSALoadIntr(Op, DAG, Intr);
+ return lowerMSALoadIntr(Op, DAG, Intr, Subtarget);
}
}
-static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr,
+ const MipsSubtarget &Subtarget) {
SDLoc DL(Op);
SDValue ChainIn = Op->getOperand(0);
SDValue Value = Op->getOperand(2);
@@ -2294,6 +2408,12 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
SDValue Offset = Op->getOperand(4);
EVT PtrTy = Address->getValueType(0);
+  // For N64, addresses have the underlying type MVT::i64. This intrinsic
+ // however takes an i32 signed constant offset. The actual type of the
+ // intrinsic is a scaled signed i10.
+ if (Subtarget.isABI_N64())
+ Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset);
+
Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(),
@@ -2310,7 +2430,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::mips_st_h:
case Intrinsic::mips_st_w:
case Intrinsic::mips_st_d:
- return lowerMSAStoreIntr(Op, DAG, Intr);
+ return lowerMSAStoreIntr(Op, DAG, Intr, Subtarget);
}
}
@@ -3377,8 +3497,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
unsigned Wd = MI.getOperand(0).getReg();
unsigned Fs = MI.getOperand(1).getReg();
- unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
- unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ unsigned Wt1 = RegInfo.createVirtualRegister(
+ Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass
+ : &Mips::MSA128WEvensRegClass);
+ unsigned Wt2 = RegInfo.createVirtualRegister(
+ Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass
+ : &Mips::MSA128WEvensRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
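The immediate range checks added throughout lowerINTRINSIC_WO_CHAIN above all have the same shape: each intrinsic's immediate field has a maximum that depends on the element width, and anything outside [0, Max] is rejected up front with report_fatal_error("Immediate out of range") instead of being silently truncated. A minimal standalone sketch of that pattern; the enum and the limits table are hypothetical, not the Mips intrinsic IDs.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

enum class SatKind { B, H, W, D }; // element widths 8, 16, 32, 64

static int64_t maxImmediate(SatKind Kind) {
  switch (Kind) {
  case SatKind::B: return 7;
  case SatKind::H: return 15;
  case SatKind::W: return 31;
  case SatKind::D: return 63;
  }
  return 0; // unreachable with a well-formed Kind
}

// Reject out-of-range immediates before lowering, mirroring the added checks.
static void checkImmediate(SatKind Kind, int64_t Value) {
  if (Value < 0 || Value > maxImmediate(Kind)) {
    std::fprintf(stderr, "Immediate out of range\n");
    std::abort();
  }
}

int main() {
  checkImmediate(SatKind::W, 31); // fine: a 32-bit element has bit positions 0..31
  checkImmediate(SatKind::B, 5);  // fine
  // checkImmediate(SatKind::B, 8) would abort: an 8-bit element has no bit 8.
  return 0;
}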
diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
index a2d670f8d39d..7fc0156216f5 100644
--- a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
+++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
@@ -27,7 +27,8 @@ class ManagedStringPool {
SmallVector<std::string *, 8> Pool;
public:
- ManagedStringPool() {}
+ ManagedStringPool() = default;
+
~ManagedStringPool() {
SmallVectorImpl<std::string *>::iterator Current = Pool.begin();
while (Current != Pool.end()) {
@@ -43,6 +44,6 @@ public:
}
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 04c8d5c0443e..3c2594c77f45 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -12,42 +12,83 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTXAsmPrinter.h"
#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "MCTargetDesc/NVPTXMCAsmInfo.h"
#include "NVPTX.h"
-#include "NVPTXInstrInfo.h"
+#include "NVPTXAsmPrinter.h"
#include "NVPTXMCExpr.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "cl_common_defines.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <new>
#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEPOTNAME "__local_depot"
@@ -62,11 +103,11 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
cl::desc("NVPTX Specific: Emit source line in ptx file"),
cl::init(false));
-namespace {
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
/// depends.
-void DiscoverDependentGlobals(const Value *V,
- DenseSet<const GlobalVariable *> &Globals) {
+static void
+DiscoverDependentGlobals(const Value *V,
+ DenseSet<const GlobalVariable *> &Globals) {
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
Globals.insert(GV);
else {
@@ -80,11 +121,12 @@ void DiscoverDependentGlobals(const Value *V,
/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
/// instances to be emitted, but only after any dependents have been added
-/// first.
-void VisitGlobalVariableForEmission(
- const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
- DenseSet<const GlobalVariable *> &Visited,
- DenseSet<const GlobalVariable *> &Visiting) {
+/// first.
+static void
+VisitGlobalVariableForEmission(const GlobalVariable *GV,
+ SmallVectorImpl<const GlobalVariable *> &Order,
+ DenseSet<const GlobalVariable *> &Visited,
+ DenseSet<const GlobalVariable *> &Visiting) {
// Have we already visited this one?
if (Visited.count(GV))
return;
@@ -108,7 +150,6 @@ void VisitGlobalVariableForEmission(
Visited.insert(GV);
Visiting.erase(GV);
}
-}
void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
if (!EmitLineNumbers)
@@ -369,7 +410,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
} else if (Ty->isAggregateType() || Ty->isVectorTy()) {
unsigned totalsz = DL.getTypeAllocSize(Ty);
unsigned retAlignment = 0;
- if (!llvm::getAlign(*F, 0, retAlignment))
+ if (!getAlign(*F, 0, retAlignment))
retAlignment = DL.getABITypeAlignment(Ty);
O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
<< "]";
@@ -401,7 +442,6 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
}
}
O << ") ";
- return;
}
void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
@@ -459,7 +499,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
MRI = &MF->getRegInfo();
F = MF->getFunction();
emitLinkageDirective(F, O);
- if (llvm::isKernelFunction(*F))
+ if (isKernelFunction(*F))
O << ".entry ";
else {
O << ".func ";
@@ -470,7 +510,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
emitFunctionParamList(*MF, O);
- if (llvm::isKernelFunction(*F))
+ if (isKernelFunction(*F))
emitKernelFunctionDirectives(*F, O);
OutStreamer->EmitRawText(O.str());
@@ -513,15 +553,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If none of reqntid* is specified, don't output reqntid directive.
unsigned reqntidx, reqntidy, reqntidz;
bool specified = false;
- if (!llvm::getReqNTIDx(F, reqntidx))
+ if (!getReqNTIDx(F, reqntidx))
reqntidx = 1;
else
specified = true;
- if (!llvm::getReqNTIDy(F, reqntidy))
+ if (!getReqNTIDy(F, reqntidy))
reqntidy = 1;
else
specified = true;
- if (!llvm::getReqNTIDz(F, reqntidz))
+ if (!getReqNTIDz(F, reqntidz))
reqntidz = 1;
else
specified = true;
@@ -535,15 +575,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// If none of maxntid* is specified, don't output maxntid directive.
unsigned maxntidx, maxntidy, maxntidz;
specified = false;
- if (!llvm::getMaxNTIDx(F, maxntidx))
+ if (!getMaxNTIDx(F, maxntidx))
maxntidx = 1;
else
specified = true;
- if (!llvm::getMaxNTIDy(F, maxntidy))
+ if (!getMaxNTIDy(F, maxntidy))
maxntidy = 1;
else
specified = true;
- if (!llvm::getMaxNTIDz(F, maxntidz))
+ if (!getMaxNTIDz(F, maxntidz))
maxntidz = 1;
else
specified = true;
@@ -553,11 +593,11 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
<< "\n";
unsigned mincta;
- if (llvm::getMinCTASm(F, mincta))
+ if (getMinCTASm(F, mincta))
O << ".minnctapersm " << mincta << "\n";
unsigned maxnreg;
- if (llvm::getMaxNReg(F, maxnreg))
+ if (getMaxNReg(F, maxnreg))
O << ".maxnreg " << maxnreg << "\n";
}
@@ -617,12 +657,9 @@ void NVPTXAsmPrinter::printVecModifiedImmediate(
llvm_unreachable("Unknown Modifier on immediate operand");
}
-
-
void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
-
emitLinkageDirective(F, O);
- if (llvm::isKernelFunction(*F))
+ if (isKernelFunction(*F))
O << ".entry ";
else
O << ".func ";
@@ -684,7 +721,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
if (!gv->hasInternalLinkage())
return false;
PointerType *Pty = gv->getType();
- if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+ if (Pty->getAddressSpace() != ADDRESS_SPACE_SHARED)
return false;
const Function *oneFunc = nullptr;
@@ -699,7 +736,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
}
static bool useFuncSeen(const Constant *C,
- llvm::DenseMap<const Function *, bool> &seenMap) {
+ DenseMap<const Function *, bool> &seenMap) {
for (const User *U : C->users()) {
if (const Constant *cu = dyn_cast<Constant>(U)) {
if (useFuncSeen(cu, seenMap))
@@ -719,7 +756,7 @@ static bool useFuncSeen(const Constant *C,
}
void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
- llvm::DenseMap<const Function *, bool> seenMap;
+ DenseMap<const Function *, bool> seenMap;
for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
const Function *F = &*FI;
@@ -1040,7 +1077,6 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
raw_ostream &O,
bool processDemoted) {
-
// Skip meta data
if (GVar->hasSection()) {
if (GVar->getSection() == "llvm.metadata")
@@ -1069,13 +1105,13 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ".weak ";
}
- if (llvm::isTexture(*GVar)) {
- O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+ if (isTexture(*GVar)) {
+ O << ".global .texref " << getTextureName(*GVar) << ";\n";
return;
}
- if (llvm::isSurface(*GVar)) {
- O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+ if (isSurface(*GVar)) {
+ O << ".global .surfref " << getSurfaceName(*GVar) << ";\n";
return;
}
@@ -1088,8 +1124,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
return;
}
- if (llvm::isSampler(*GVar)) {
- O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+ if (isSampler(*GVar)) {
+ O << ".global .samplerref " << getSamplerName(*GVar);
const Constant *Initializer = nullptr;
if (GVar->hasInitializer())
@@ -1150,12 +1186,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
}
if (GVar->hasPrivateLinkage()) {
-
- if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+ if (strncmp(GVar->getName().data(), "unrollpragma", 12) == 0)
return;
// FIXME - need better way (e.g. Metadata) to avoid generating this global
- if (!strncmp(GVar->getName().data(), "filename", 8))
+ if (strncmp(GVar->getName().data(), "filename", 8) == 0)
return;
if (GVar->use_empty())
return;
@@ -1199,8 +1234,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
  // Ptx allows variable initialization only for constant and global state
// spaces.
if (GVar->hasInitializer()) {
- if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
- (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+ if ((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) {
const Constant *Initializer = GVar->getInitializer();
// 'undef' is treated as there is no value specified.
if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
@@ -1233,8 +1268,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
ElementSize = DL.getTypeStoreSize(ETy);
  // Ptx allows variable initialization only for constant and
// global state spaces.
- if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
- (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+ if (((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) &&
GVar->hasInitializer()) {
const Constant *Initializer = GVar->getInitializer();
if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
@@ -1285,7 +1320,6 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
default:
llvm_unreachable("type not supported yet");
}
-
}
O << ";\n";
}
@@ -1305,16 +1339,16 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
raw_ostream &O) const {
switch (AddressSpace) {
- case llvm::ADDRESS_SPACE_LOCAL:
+ case ADDRESS_SPACE_LOCAL:
O << "local";
break;
- case llvm::ADDRESS_SPACE_GLOBAL:
+ case ADDRESS_SPACE_GLOBAL:
O << "global";
break;
- case llvm::ADDRESS_SPACE_CONST:
+ case ADDRESS_SPACE_CONST:
O << "const";
break;
- case llvm::ADDRESS_SPACE_SHARED:
+ case ADDRESS_SPACE_SHARED:
O << "shared";
break;
default:
@@ -1363,7 +1397,6 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
raw_ostream &O) {
-
const DataLayout &DL = getDataLayout();
// GlobalVariables are always constant pointers themselves.
@@ -1406,7 +1439,6 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
default:
llvm_unreachable("type not supported yet");
}
- return;
}
static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) {
@@ -1450,7 +1482,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
Function::const_arg_iterator I, E;
unsigned paramIndex = 0;
bool first = true;
- bool isKernelFunc = llvm::isKernelFunction(*F);
+ bool isKernelFunc = isKernelFunction(*F);
bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
MVT thePointerTy = TLI->getPointerTy(DL);
@@ -1533,13 +1565,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
default:
O << ".ptr ";
break;
- case llvm::ADDRESS_SPACE_CONST:
+ case ADDRESS_SPACE_CONST:
O << ".ptr .const ";
break;
- case llvm::ADDRESS_SPACE_SHARED:
+ case ADDRESS_SPACE_SHARED:
O << ".ptr .shared ";
break;
- case llvm::ADDRESS_SPACE_GLOBAL:
+ case ADDRESS_SPACE_GLOBAL:
O << ".ptr .global ";
break;
}
@@ -1820,7 +1852,6 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) {
void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
AggBuffer *aggBuffer) {
-
const DataLayout &DL = getDataLayout();
if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
@@ -1985,7 +2016,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
// buildTypeNameMap - Run through symbol table looking for type names.
//
-
bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
@@ -2100,7 +2130,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
raw_string_ostream OS(S);
OS << "Unsupported expression in static initializer: ";
CE->printAsOperand(OS, /*PrintType=*/ false,
- !MF ? 0 : MF->getFunction()->getParent());
+ !MF ? nullptr : MF->getFunction()->getParent());
report_fatal_error(OS.str());
}
@@ -2330,7 +2360,7 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
raw_ostream &O, const char *Modifier) {
printOperand(MI, opNum, O);
- if (Modifier && !strcmp(Modifier, "add")) {
+ if (Modifier && strcmp(Modifier, "add") == 0) {
O << ", ";
printOperand(MI, opNum + 1, O);
} else {
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 3dcc0e358a14..8ec3476b8719 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -1,4 +1,4 @@
-//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,17 +18,34 @@
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Value.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
// The ptx syntax and format is very different from that usually seen in a .s
// file,
@@ -40,7 +57,8 @@
// (subclass of MCStreamer).
namespace llvm {
- class MCOperand;
+
+class MCOperand;
class LineReader {
private:
@@ -49,14 +67,17 @@ private:
char buff[512];
std::string theFileName;
SmallVector<unsigned, 32> lineOffset;
+
public:
LineReader(std::string filename) {
theCurLine = 0;
fstr.open(filename.c_str());
theFileName = filename;
}
- std::string fileName() { return theFileName; }
+
~LineReader() { fstr.close(); }
+
+ std::string fileName() { return theFileName; }
std::string readLine(unsigned line);
};
@@ -107,6 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
numSymbols = 0;
EmitGeneric = AP.EmitGeneric;
}
+
unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
assert((curpos + Num) <= size);
assert((curpos + Bytes) <= size);
@@ -120,6 +142,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
return curpos;
}
+
unsigned addZeros(int Num) {
assert((curpos + Num) <= size);
for (int i = 0; i < Num; ++i) {
@@ -128,12 +151,14 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
return curpos;
}
+
void addSymbol(const Value *GVar, const Value *GVarBeforeStripping) {
symbolPosInBuffer.push_back(curpos);
Symbols.push_back(GVar);
SymbolsBeforeStripping.push_back(GVarBeforeStripping);
numSymbols++;
}
+
void print() {
if (numSymbols == 0) {
// print out in bytes
@@ -267,7 +292,7 @@ private:
std::map<Type *, std::string> TypeNameMap;
// List of variables demoted to a function scope.
- std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
+ std::map<const Function *, std::vector<const GlobalVariable *>> localDecls;
// To record filename to ID mapping
std::map<std::string, unsigned> filenameMap;
@@ -292,7 +317,8 @@ private:
bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
- LineReader *reader;
+ LineReader *reader = nullptr;
+
LineReader *getReader(const std::string &);
// Used to control the need to emit .generic() in the initializer of
@@ -312,20 +338,17 @@ public:
NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)),
EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
- NVPTX::CUDA) {
- CurrentBankselLabelInBasicBlock = "";
- reader = nullptr;
- }
+ NVPTX::CUDA) {}
- ~NVPTXAsmPrinter() {
- if (!reader)
- delete reader;
+ ~NVPTXAsmPrinter() override {
+ delete reader;
}
bool runOnMachineFunction(MachineFunction &F) override {
nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
return AsmPrinter::runOnMachineFunction(F);
}
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>();
AsmPrinter::getAnalysisUsage(AU);
@@ -338,6 +361,7 @@ public:
DebugLoc prevDebugLoc;
void emitLineNumberAsDotLoc(const MachineInstr &);
};
-} // end of namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2e4764feff11..7a760fd38d0f 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1,3 +1,4 @@
+//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,31 +12,55 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTXISelLowering.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXSection.h"
+#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
-#include "llvm/MC/MCSectionELF.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
#undef DEBUG_TYPE
#define DEBUG_TYPE "nvptx-lower"
@@ -109,7 +134,6 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
const NVPTXSubtarget &STI)
: TargetLowering(TM), nvTM(&TM), STI(STI) {
-
// always lower memset, memcpy, and memmove intrinsics to load/store
// instructions, rather
// than generating calls to memset, memcpy, or memmove.
@@ -981,7 +1005,7 @@ std::string NVPTXTargetLowering::getPrototype(
unsigned align = 0;
const CallInst *CallI = cast<CallInst>(CS->getInstruction());
// +1 because index 0 is reserved for return type alignment
- if (!llvm::getAlign(*CallI, i + 1, align))
+ if (!getAlign(*CallI, i + 1, align))
align = DL.getABITypeAlignment(Ty);
unsigned sz = DL.getTypeAllocSize(Ty);
O << ".param .align " << align << " .b8 ";
@@ -1047,7 +1071,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
// With bitcast'd call targets, the instruction will be the call
if (isa<CallInst>(CalleeI)) {
// Check if we have call alignment metadata
- if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+ if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
return Align;
const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
@@ -1070,7 +1094,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
// Check for function alignment information if we found that the
// ultimate target is a Function
if (DirectCallee)
- if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+ if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
return Align;
// Call is indirect or alignment information is not available, fall back to
@@ -1747,7 +1771,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
if (VTBits == 32 && STI.getSmVersion() >= 35) {
-
// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
// {dHi, dLo} = {aHi, aLo} >> Amt
// dHi = aHi >> Amt
@@ -1761,7 +1784,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
else {
-
// {dHi, dLo} = {aHi, aLo} >> Amt
// - if (Amt>=size) then
// dLo = aHi >> (Amt-size)
@@ -1809,7 +1831,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
SDValue ShAmt = Op.getOperand(2);
if (VTBits == 32 && STI.getSmVersion() >= 35) {
-
// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
// {dHi, dLo} = {aHi, aLo} << Amt
// dHi = shf.l.clamp aLo, aHi, Amt
@@ -1823,7 +1844,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
else {
-
// {dHi, dLo} = {aHi, aLo} << Amt
// - if (Amt>=size) then
// dLo = aLo << Amt (all 0)
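The comments in these last two hunks describe the same decomposition for right and left shifts: when the sm_35 funnel-shift instruction ('shf') is unavailable, a 64-bit shift is rebuilt from 32-bit halves with a select on whether the amount crosses the 32-bit boundary. A scalar sketch of the shift-right case, assuming 0 <= Amt < 64 (plain C++ for illustration only; the real code builds the equivalent SelectionDAG nodes):

    #include <cstdint>

    // Scalar model of the non-shf fallback above (hypothetical helper).
    uint64_t lshr64_via_parts(uint32_t Hi, uint32_t Lo, unsigned Amt) {
      if (Amt == 0)
        return ((uint64_t)Hi << 32) | Lo;
      uint32_t dHi, dLo;
      if (Amt >= 32) {                            // the "Amt >= size" case
        dLo = Hi >> (Amt - 32);
        dHi = 0;                                  // SRA would use Hi >> 31 here
      } else {
        dLo = (Lo >> Amt) | (Hi << (32 - Amt));   // low word mixes both halves
        dHi = Hi >> Amt;
      }
      return ((uint64_t)dHi << 32) | dLo;
    }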
@@ -2002,11 +2022,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case 2:
Opcode = NVPTXISD::StoreV2;
break;
- case 4: {
+ case 4:
Opcode = NVPTXISD::StoreV4;
break;
}
- }
SmallVector<SDValue, 8> Ops;
@@ -2140,7 +2159,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
theArgs[i],
(theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
: nullptr))) {
- assert(llvm::isKernelFunction(*F) &&
+ assert(isKernelFunction(*F) &&
"Only kernels can have image/sampler params");
InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
continue;
@@ -2193,7 +2212,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
0);
assert(vtparts.size() > 0 && "empty aggregate type not expected");
bool aggregateIsPacked = false;
- if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
+ if (StructType *STy = dyn_cast<StructType>(Ty))
aggregateIsPacked = STy->isPacked();
SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
@@ -2202,7 +2221,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
EVT partVT = vtparts[parti];
Value *srcValue = Constant::getNullValue(
PointerType::get(partVT.getTypeForEVT(F->getContext()),
- llvm::ADDRESS_SPACE_PARAM));
+ ADDRESS_SPACE_PARAM));
SDValue srcAddr =
DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
DAG.getConstant(offsets[parti], dl, PtrVT));
@@ -2242,7 +2261,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
if (NumElts == 1) {
// We only have one element, so just directly load it
Value *SrcValue = Constant::getNullValue(PointerType::get(
- EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
@@ -2260,7 +2279,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// f32,f32 = load ...
EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
Value *SrcValue = Constant::getNullValue(PointerType::get(
- VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
@@ -2301,7 +2320,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
for (unsigned i = 0; i < NumElts; i += VecSize) {
Value *SrcValue = Constant::getNullValue(
PointerType::get(VecVT.getTypeForEVT(F->getContext()),
- llvm::ADDRESS_SPACE_PARAM));
+ ADDRESS_SPACE_PARAM));
SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
DAG.getConstant(Ofst, dl, PtrVT));
SDValue P = DAG.getLoad(
@@ -2335,7 +2354,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// If ABI, load from the param symbol
SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
Value *srcValue = Constant::getNullValue(PointerType::get(
- ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
SDValue p;
if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
@@ -2424,7 +2443,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
DAG.getVTList(MVT::Other), Ops,
EltVT, MachinePointerInfo());
-
} else if (NumElts == 2) {
// V2 store
SDValue StoreVal0 = OutVals[0];
@@ -2558,7 +2576,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
}
-
void NVPTXTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
@@ -3306,7 +3323,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.memVT = getValueType(DL, I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = true;
Info.align = 0;
@@ -3326,7 +3343,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.memVT = getValueType(DL, I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
@@ -3347,7 +3364,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.memVT = getValueType(DL, I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
@@ -3410,17 +3427,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
Info.opc = getOpcForTextureInstr(Intrinsic);
Info.memVT = MVT::v4f32;
Info.ptrVal = nullptr;
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = 16;
return true;
- }
+
case Intrinsic::nvvm_tex_1d_v4s32_s32:
case Intrinsic::nvvm_tex_1d_v4s32_f32:
case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
@@ -3532,17 +3549,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
Info.opc = getOpcForTextureInstr(Intrinsic);
Info.memVT = MVT::v4i32;
Info.ptrVal = nullptr;
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = 16;
return true;
- }
+
case Intrinsic::nvvm_suld_1d_i8_clamp:
case Intrinsic::nvvm_suld_1d_v2i8_clamp:
case Intrinsic::nvvm_suld_1d_v4i8_clamp:
@@ -3587,17 +3604,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
case Intrinsic::nvvm_suld_3d_i8_zero:
case Intrinsic::nvvm_suld_3d_v2i8_zero:
- case Intrinsic::nvvm_suld_3d_v4i8_zero: {
+ case Intrinsic::nvvm_suld_3d_v4i8_zero:
Info.opc = getOpcForSurfaceInstr(Intrinsic);
Info.memVT = MVT::i8;
Info.ptrVal = nullptr;
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = 16;
return true;
- }
+
case Intrinsic::nvvm_suld_1d_i16_clamp:
case Intrinsic::nvvm_suld_1d_v2i16_clamp:
case Intrinsic::nvvm_suld_1d_v4i16_clamp:
@@ -3642,17 +3659,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
case Intrinsic::nvvm_suld_3d_i16_zero:
case Intrinsic::nvvm_suld_3d_v2i16_zero:
- case Intrinsic::nvvm_suld_3d_v4i16_zero: {
+ case Intrinsic::nvvm_suld_3d_v4i16_zero:
Info.opc = getOpcForSurfaceInstr(Intrinsic);
Info.memVT = MVT::i16;
Info.ptrVal = nullptr;
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = 16;
return true;
- }
+
case Intrinsic::nvvm_suld_1d_i32_clamp:
case Intrinsic::nvvm_suld_1d_v2i32_clamp:
case Intrinsic::nvvm_suld_1d_v4i32_clamp:
@@ -3697,17 +3714,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
case Intrinsic::nvvm_suld_3d_i32_zero:
case Intrinsic::nvvm_suld_3d_v2i32_zero:
- case Intrinsic::nvvm_suld_3d_v4i32_zero: {
+ case Intrinsic::nvvm_suld_3d_v4i32_zero:
Info.opc = getOpcForSurfaceInstr(Intrinsic);
Info.memVT = MVT::i32;
Info.ptrVal = nullptr;
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = 16;
return true;
- }
+
case Intrinsic::nvvm_suld_1d_i64_clamp:
case Intrinsic::nvvm_suld_1d_v2i64_clamp:
case Intrinsic::nvvm_suld_1d_array_i64_clamp:
@@ -3737,18 +3754,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_suld_2d_array_i64_zero:
case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
case Intrinsic::nvvm_suld_3d_i64_zero:
- case Intrinsic::nvvm_suld_3d_v2i64_zero: {
+ case Intrinsic::nvvm_suld_3d_v2i64_zero:
Info.opc = getOpcForSurfaceInstr(Intrinsic);
Info.memVT = MVT::i64;
Info.ptrVal = nullptr;
Info.offset = 0;
- Info.vol = 0;
+ Info.vol = false;
Info.readMem = true;
Info.writeMem = false;
Info.align = 16;
return true;
}
- }
return false;
}
@@ -3760,7 +3776,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
-
// AddrMode - This represents an addressing mode of:
// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
//
@@ -4059,7 +4074,7 @@ static SDValue PerformANDCombine(SDNode *N,
}
bool AddTo = false;
- if (AExt.getNode() != 0) {
+ if (AExt.getNode() != nullptr) {
// Re-insert the ext as a zext.
Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
AExt.getValueType(), Val);
@@ -4204,7 +4219,6 @@ static bool IsMulWideOperandDemotable(SDValue Op,
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
unsigned OptSize,
bool &IsSigned) {
-
OperandSignedness LHSSign;
// The LHS operand must be a demotable op
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 92a88c7f2506..0fbb0448e4c4 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,7 +144,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
-def true : Predicate<"1">;
+def true : Predicate<"true">;
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
index cad4f5668fdf..b0472de980fc 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -1,4 +1,4 @@
-//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//===- NVPTXSection.h - NVPTX-specific section representation ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,18 +14,20 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
-#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/SectionKind.h"
namespace llvm {
+
/// Represents a section in PTX. PTX does not have sections; we create this class
/// in order to use the ASMPrint interface.
///
class NVPTXSection final : public MCSection {
virtual void anchor();
+
public:
NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {}
- ~NVPTXSection() {}
+ ~NVPTXSection() = default;
/// Override this as NVPTX has its own way of printing switching
/// to a section.
@@ -40,4 +42,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 6c68a2c9370d..eb357e0a4d50 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -11,41 +11,28 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTXTargetMachine.h"
-#include "MCTargetDesc/NVPTXMCAsmInfo.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
+#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
+#include <cassert>
+#include <string>
using namespace llvm;
@@ -57,6 +44,7 @@ static cl::opt<bool>
cl::init(false), cl::Hidden);
namespace llvm {
+
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
@@ -66,7 +54,8 @@ void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
-}
+
+} // end namespace llvm
extern "C" void LLVMInitializeNVPTXTarget() {
// Register the target.
@@ -109,7 +98,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
Reloc::PIC_, CM, OL),
is64bit(is64bit),
- TLOF(make_unique<NVPTXTargetObjectFile>()),
+ TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
@@ -118,7 +107,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-NVPTXTargetMachine::~NVPTXTargetMachine() {}
+NVPTXTargetMachine::~NVPTXTargetMachine() = default;
void NVPTXTargetMachine32::anchor() {}
@@ -141,6 +130,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
namespace {
+
class NVPTXPassConfig : public TargetPassConfig {
public:
NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
@@ -170,6 +160,7 @@ private:
// Add passes that perform straight-line scalar optimizations.
void addStraightLineScalarOptimizationPasses();
};
+
} // end anonymous namespace
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index dc367a90594a..69c59d0296ab 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -11,14 +11,13 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
#include "NVPTXSection.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
-class GlobalVariable;
-class Module;
class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
-
public:
NVPTXTargetObjectFile() {
TextSection = nullptr;
@@ -43,7 +42,7 @@ public:
DwarfMacinfoSection = nullptr;
}
- virtual ~NVPTXTargetObjectFile();
+ ~NVPTXTargetObjectFile() override;
void Initialize(MCContext &ctx, const TargetMachine &TM) override {
TargetLoweringObjectFile::Initialize(ctx, TM);
@@ -52,7 +51,6 @@ public:
BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
ReadOnlySection =
new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
-
StaticCtorSection =
new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
StaticDtorSection =
@@ -102,4 +100,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 48928ee2d540..dd7707084948 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -115,7 +115,7 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
int NVPTXTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d953aa8a7199..b6c271ae4cbc 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -54,7 +54,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
};
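The new trailing ArrayRef<const Value *> Args parameter (defaulted to an empty list here and in the other targets below) lets a caller hand the instruction's actual operands to the cost model, so a target can recognize operand patterns it handles cheaply; existing callers keep compiling unchanged. A hedged sketch of a client using the extended signature (assumed caller, not part of this change):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Forward an instruction's operands through the new Args parameter.
    static int queryArithCost(const TargetTransformInfo &TTI, Instruction &I) {
      SmallVector<const Value *, 4> Operands(I.value_op_begin(), I.value_op_end());
      return TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(),
                                        TargetTransformInfo::OK_AnyValue,
                                        TargetTransformInfo::OK_AnyValue,
                                        TargetTransformInfo::OP_None,
                                        TargetTransformInfo::OP_None,
                                        Operands);
    }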
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f7785342b364..f94d1eab097d 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -281,7 +281,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
int PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
// Fallback to the default implementation.
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 8308086ccfaa..30ee2814aba1 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -71,7 +71,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2081809def70..2d0a06af18ae 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -547,8 +547,26 @@ bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I,
assert (isa<LoadInst>(I) || isa<StoreInst>(I));
Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
I->getOperand(0)->getType());
- if (!isUInt<12>(Offset) &&
- (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy()))
+ bool IsFPAccess = MemAccessTy->isFloatingPointTy();
+ bool IsVectorAccess = MemAccessTy->isVectorTy();
+
+ // A store of an extracted vector element will be combined into a VSTE type
+ // instruction.
+ if (!IsVectorAccess && isa<StoreInst>(I)) {
+ Value *DataOp = I->getOperand(0);
+ if (isa<ExtractElementInst>(DataOp))
+ IsVectorAccess = true;
+ }
+
+ // A load which gets inserted into a vector element will be combined into a
+ // VLE type instruction.
+ if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
+ User *LoadUser = *I->user_begin();
+ if (isa<InsertElementInst>(LoadUser))
+ IsVectorAccess = true;
+ }
+
+ if (!isUInt<12>(Offset) && (IsFPAccess || IsVectorAccess))
return false;
return true;
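The intent of the two new checks: a scalar store fed directly by an extractelement, and a scalar load whose only user is an insertelement, are expected to be selected as the vector element instructions VSTE/VLE, which only offer a 12-bit unsigned displacement, so larger offsets must not be folded into the addressing mode even though the IR access type is scalar. A small source-level illustration of the store case (GNU/Clang vector extension assumed; sketch only):

    typedef int v4si __attribute__((vector_size(16)));

    // The scalar store below takes its value from a vector lane, so the
    // backend now treats it as a vector access: the 4096-byte displacement
    // exceeds VSTEF's 12-bit unsigned field and stays out of the address.
    void store_lane0(int *dst, v4si v) {
      dst[1024] = v[0];
    }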
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
index e16ced1661a1..8a6d28490e8c 100644
--- a/contrib/llvm/lib/Target/TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -44,7 +44,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
const TargetOptions &Options)
: TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
- RequireStructuredCFG(false), Options(Options) {
+ RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) {
if (EnableIPRA.getNumOccurrences())
this->Options.EnableIPRA = EnableIPRA;
}
@@ -63,14 +63,15 @@ bool TargetMachine::isPositionIndependent() const {
/// \brief Reset the target options based on the function's attributes.
// FIXME: This function needs to go away for a number of reasons:
// a) global state on the TargetMachine is terrible in general,
-// b) there's no default state here to keep,
-// c) these target options should be passed only on the function
+// b) these target options should be passed only on the function
// and not on the TargetMachine (via TargetOptions) at all.
void TargetMachine::resetTargetOptions(const Function &F) const {
#define RESET_OPTION(X, Y) \
do { \
if (F.hasFnAttribute(Y)) \
Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \
+ else \
+ Options.X = DefaultOptions.X; \
} while (0)
RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
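Keeping DefaultOptions next to Options is what makes the new else branch possible: when a function lacks the attribute, the option is restored to the value the TargetMachine was created with instead of silently inheriting whatever the previously compiled function set. Expanded for the first option, the macro amounts to roughly this (sketch of the expansion, not new code):

    // RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad") expands to:
    if (F.hasFnAttribute("less-precise-fpmad"))
      Options.LessPreciseFPMADOption =
          (F.getFnAttribute("less-precise-fpmad").getValueAsString() == "true");
    else
      Options.LessPreciseFPMADOption = DefaultOptions.LessPreciseFPMADOption;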
@@ -87,6 +88,8 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
Options.FPDenormalMode = FPDenormal::PreserveSign;
else if (Denormal == "positive-zero")
Options.FPDenormalMode = FPDenormal::PositiveZero;
+ else
+ Options.FPDenormalMode = DefaultOptions.FPDenormalMode;
}
/// Returns the code generation relocation model. The choices are static, PIC,
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 529540ea4ed2..bc7020fded8c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -663,6 +663,9 @@ bool WebAssemblyFastISel::fastLowerArguments() {
for (auto const &Arg : F->args())
MFI->addParam(getLegalType(getSimpleType(Arg.getType())));
+ if (!F->getReturnType()->isVoidTy())
+ MFI->addResult(getLegalType(getSimpleType(F->getReturnType())));
+
return true;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index d5474a02ce01..adf904ee0269 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -62,12 +62,19 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
// Recursively descend the def-use lists from V to find non-bitcast users of
// bitcasts of V.
static void FindUses(Value *V, Function &F,
- SmallVectorImpl<std::pair<Use *, Function *>> &Uses) {
+ SmallVectorImpl<std::pair<Use *, Function *>> &Uses,
+ SmallPtrSetImpl<Constant *> &ConstantBCs) {
for (Use &U : V->uses()) {
if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser()))
- FindUses(BC, F, Uses);
- else if (U.get()->getType() != F.getType())
+ FindUses(BC, F, Uses, ConstantBCs);
+ else if (U.get()->getType() != F.getType()) {
+ if (isa<Constant>(U.get())) {
+ // Only add constant bitcasts to the list once; they get RAUW'd
+ auto c = ConstantBCs.insert(cast<Constant>(U.get()));
+ if (!c.second) continue;
+ }
Uses.push_back(std::make_pair(&U, &F));
+ }
}
}
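Constant bitcasts such as 'bitcast (void ()* @f to i32 ()*)' are uniqued module-wide, so the same constant can be reached from many call sites while FindUses walks the use lists; one wrapper plus one RAUW fixes all of them, hence the single entry. The dedup is the usual SmallPtrSet insert idiom (fragment of the loop above, for illustration):

    // insert() returns {iterator, inserted}; a false second member means the
    // constant was already recorded, so the duplicate use is skipped.
    SmallPtrSet<Constant *, 2> Seen;
    if (auto *C = dyn_cast<Constant>(U.get()))
      if (!Seen.insert(C).second)
        continue;  // one queued entry is enough; RAUW rewrites every use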
@@ -122,10 +129,10 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) {
bool FixFunctionBitcasts::runOnModule(Module &M) {
SmallVector<std::pair<Use *, Function *>, 0> Uses;
+ SmallPtrSet<Constant *, 2> ConstantBCs;
// Collect all the places that need wrappers.
- for (Function &F : M)
- FindUses(&F, F, Uses);
+ for (Function &F : M) FindUses(&F, F, Uses, ConstantBCs);
DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index bf546dab5fbb..47aadf99e860 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 2a2e3941f82d..f658609f8930 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -61,7 +61,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
/// @}
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
index dc18a59a30ba..83a23d4ad680 100644
--- a/contrib/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
-def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
- "Use 16-bit divide for positive values less than 65536">;
+ "Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
@@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
+ FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
@@ -760,6 +761,42 @@ def : Proc<"bdver4", [
FeatureMWAITX
]>;
+// TODO: The scheduler model currently falls back to the BTVER2 model;
+// a dedicated znver1 model has yet to be put in place.
+// Zen
+def: ProcessorModel<"znver1", BtVer2Model, [
+ FeatureADX,
+ FeatureAES,
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureCLFLUSHOPT,
+ FeatureCMPXCHG16B,
+ FeatureF16C,
+ FeatureFMA,
+ FeatureFSGSBase,
+ FeatureFXSR,
+ FeatureFastLZCNT,
+ FeatureLAHFSAHF,
+ FeatureLZCNT,
+ FeatureMMX,
+ FeatureMOVBE,
+ FeatureMWAITX,
+ FeaturePCLMUL,
+ FeaturePOPCNT,
+ FeaturePRFCHW,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureSHA,
+ FeatureSMAP,
+ FeatureSSE4A,
+ FeatureSlowSHLD,
+ FeatureX87,
+ FeatureXSAVE,
+ FeatureXSAVEC,
+ FeatureXSAVEOPT,
+ FeatureXSAVES]>;
+
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8b66790679d9..8ab4c0616880 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -183,16 +183,6 @@ namespace {
void PreprocessISelDAG() override;
- inline bool immSext8(SDNode *N) const {
- return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
- }
-
- // True if the 64-bit immediate fits in a 32-bit sign-extended field.
- inline bool i64immSExt32(SDNode *N) const {
- uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
- return (int64_t)v == (int32_t)v;
- }
-
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index db76ddf04c06..787dff99367e 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
- // Bypass expensive divides on Atom when compiling with O2.
+ // Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
- addBypassSlowDiv(64, 16);
+ addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetKnownWindowsMSVC() ||
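addBypassSlowDiv(64, 32) asks the BypassSlowDivision transform to guard each 64-bit division with a runtime check and take a 32-bit divide when both operands happen to fit; the previous (64, 16) pairing only helped values below 65536 and almost never fired, which is also why the .td description above was corrected to "less than 2^32". Conceptually the emitted code behaves like this (C++ sketch):

    #include <cstdint>

    // Shape of the bypass generated for 'a / b' when 64-bit divide is slow.
    uint64_t div64_bypass(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0)               // both operands fit in 32 bits
        return (uint32_t)a / (uint32_t)b;     // cheap 32-bit divl
      return a / b;                           // full-width divq
    }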
@@ -1280,6 +1280,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
// FIXME: These commands are available on SSE/AVX2; add relevant patterns.
setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
@@ -1306,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
- }
+
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
@@ -8090,6 +8089,37 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}
+// The shuffle result has the form:
+//   0*a[0] 0*a[1] ... 0*a[n], n >= 0, with the a[] elements in ascending order.
+// Each element of Zeroable corresponds to a particular element of Mask, as
+// described in computeZeroableShuffleElements().
+//
+// The function looks for a sub-mask whose nonzero elements are in
+// increasing order; if such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
+ ArrayRef<int> Mask,const EVT &VectorType,
+ bool &IsZeroSideLeft) {
+ int NextElement = -1;
+ // Check if the Mask's nonzero elements are in increasing order.
+ for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ // Check that the mask's zero elements are built from only zeros (no undef).
+ if (Mask[i] == -1)
+ return false;
+ if (Zeroable[i])
+ continue;
+ // Find the lowest nonzero element.
+ if (NextElement == -1) {
+ NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+ IsZeroSideLeft = NextElement != 0;
+ }
+ // Exit if the mask's nonzero elements are not in increasing order.
+ if (NextElement != Mask[i])
+ return false;
+ NextElement++;
+ }
+ return true;
+}
+
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
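A concrete case the new check accepts: a v4f64 shuffle of V1 with an all-zero V2 and mask <4, 0, 6, 1>. Lanes 0 and 2 read the zero vector (so they are Zeroable), and the remaining lanes read V1[0] and V1[1] consecutively starting at 0, so the whole shuffle can be emitted as a VEXPAND of V1 under the k-mask 0b1010 (not(Zeroable)). A self-contained restatement of the ordering rule with that example (C++ sketch using plain std::vector, hypothetical helper):

    #include <cassert>
    #include <vector>

    // Simplified restatement of the rule: the non-zeroable mask elements must
    // be consecutive and start either at 0 (expand V1) or at NumElts (expand V2).
    static bool nonZeroElementsInOrder(const std::vector<bool> &Zeroable,
                                       const std::vector<int> &Mask,
                                       int NumElts) {
      int Next = -1;
      for (size_t i = 0; i < Zeroable.size(); ++i) {
        if (Mask[i] == -1)
          return false;                 // undef lanes are rejected
        if (Zeroable[i])
          continue;                     // this lane is produced as a zero
        if (Next == -1)
          Next = (Mask[i] != 0) ? NumElts : 0;
        if (Next != Mask[i])
          return false;
        ++Next;
      }
      return true;
    }

    int main() {
      // shufflevector of V1 with an all-zero V2, mask <4, 0, 6, 1>:
      // lanes 0 and 2 read the zero vector, lanes 1 and 3 read V1[0] and
      // V1[1] in order -> VEXPAND of V1 under k-mask 0b1010.
      assert(nonZeroElementsInOrder({true, false, true, false}, {4, 0, 6, 1}, 4));
    }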
@@ -8145,6 +8175,46 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl);
+
+// Takes a SmallBitVector as an argument and converts it to an unsigned
+// integer.
+// The output of the function is not(Zeroable).
+static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
+ unsigned convertBit = 0;
+ for (int i = 0, e = Zeroable.size(); i < e; i++)
+ convertBit |= !(Zeroable[i]) << i;
+ return convertBit;
+}
+
+// X86 has a dedicated shuffle that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsLeftZeroSide = true;
+ if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+ IsLeftZeroSide))
+ return SDValue();
+ unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+ "Unexpected number of vector elements");
+ SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+ Subtarget, DAG, DL);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+ return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
+}
+
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -12159,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
@@ -12222,12 +12297,17 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Shift;
- // If we have VLX support, we can use VALIGN.
- if (Subtarget.hasVLX())
+ // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
// Try to use PALIGNR.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
@@ -12328,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
@@ -12392,12 +12477,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Shift;
- // If we have VLX support, we can use VALIGN.
- if (Subtarget.hasVLX())
+ // If we have VLX support, we can use VALIGN or EXPAND.
+ if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12754,6 +12844,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12796,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+ V2, DAG, Subtarget))
+ return V;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12832,6 +12928,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
@@ -12889,6 +12989,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+ V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
@@ -12953,6 +13057,10 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
@@ -13089,9 +13197,9 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
@@ -15187,13 +15295,13 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
- return SDValue();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
+ (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
- assert(InVT.getVectorElementType() == MVT::i1);
+ if (InVT.getVectorElementType() != MVT::i1)
+ return SDValue();
// Extend VT if the target is 256 or 128bit vector and VLX is not supported.
MVT ExtVT = VT;
@@ -15910,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
}
+ // Sometimes flags can be set either with an AND or with an SRL/SHL
+ // instruction. The SRL/SHL variant should be preferred for masks longer than
+ // this number of bits.
+ const int ShiftToAndMaxMaskWidth = 32;
+ const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
+
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
@@ -15958,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
- if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+ if (ZeroCheck && Op->hasOneUse() &&
isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
@@ -15968,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
APInt Mask = ArithOp.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
- if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
break;
Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
DAG.getConstant(Mask, dl, VT));
@@ -15977,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
- // because a TEST instruction will be better.
+ // because a TEST instruction will be better. However, AND should be
+ // preferred if the instruction can be combined into ANDN.
if (!hasNonFlagsUse(Op)) {
SDValue Op0 = ArithOp->getOperand(0);
SDValue Op1 = ArithOp->getOperand(1);
EVT VT = ArithOp.getValueType();
bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+ bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
+
+ // If we cannot select an ANDN instruction, check if we can replace
+ // AND+IMM64 with a shift before giving up. This is possible for masks
+ // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
+ if (!isProperAndn) {
+ if (!ZeroCheck)
+ break;
+
+ assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
+ auto *CN = dyn_cast<ConstantSDNode>(Op1);
+ if (!CN)
+ break;
+
+ const APInt &Mask = CN->getAPIntValue();
+ if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
+ break; // Prefer TEST instruction.
+
+ unsigned BitWidth = Mask.getBitWidth();
+ unsigned LeadingOnes = Mask.countLeadingOnes();
+ unsigned TrailingZeros = Mask.countTrailingZeros();
+
+ if (LeadingOnes + TrailingZeros == BitWidth) {
+ assert(TrailingZeros < VT.getSizeInBits() &&
+ "Shift amount should be less than the type width");
+ MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
+ Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
+ break;
+ }
+
+ unsigned LeadingZeros = Mask.countLeadingZeros();
+ unsigned TrailingOnes = Mask.countTrailingOnes();
+
+ if (LeadingZeros + TrailingOnes == BitWidth) {
+ assert(LeadingZeros < VT.getSizeInBits() &&
+ "Shift amount should be less than the type width");
+ MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
+ Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
+ break;
+ }
- // But if we can combine this into an ANDN operation, then create an AND
- // now and allow it to be pattern matched into an ANDN.
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
break;
+ }
}
LLVM_FALLTHROUGH;
case ISD::SUB:
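Both EmitTest changes trade one ZF-producing form for another. A constant logical shift that only feeds a compare with zero becomes an AND so it can later be selected as TEST, while an AND with a wide contiguous mask such as 0xFFFFFFFF00000000 or 0x00000000FFFFFFFF goes the other way and becomes a single SRL/SHL, avoiding a 64-bit immediate that TEST cannot encode. The underlying equivalences (C++ sketch):

    #include <cstdint>

    // Only the zero flag is consumed, so each pair is interchangeable for a
    // following je/jne:
    bool highHalfZero_and(uint64_t x) { return (x & 0xFFFFFFFF00000000ULL) == 0; }
    bool highHalfZero_srl(uint64_t x) { return (x >> 32) == 0; }   // SRL form

    bool lowHalfZero_and(uint64_t x) { return (x & 0x00000000FFFFFFFFULL) == 0; }
    bool lowHalfZero_shl(uint64_t x) { return (x << 32) == 0; }    // SHL form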
@@ -16008,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
- if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ if (!NeedTruncation && ZeroCheck) {
if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
return EFLAGS;
}
@@ -17283,17 +17438,20 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
unsigned NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
- return SDValue();
-
- if (VT.is512BitVector() && InVTElt != MVT::i1) {
+ if (VT.is512BitVector() && InVTElt != MVT::i1 &&
+ (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
}
- assert (InVTElt == MVT::i1 && "Unexpected vector type");
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ if (InVTElt != MVT::i1)
+ return SDValue();
+
+ MVT ExtVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
SDValue V;
if (Subtarget.hasDQI()) {
V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
@@ -17302,7 +17460,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
- if (VT.is512BitVector())
+ if (ExtVT == VT)
return V;
}
@@ -18418,13 +18576,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
- SDValue Op0 = ShAmt.getOperand(0);
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
- ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64);
+ ShAmt = ShAmt.getOperand(0);
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
+ ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+ ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -21643,14 +21801,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
if (VT == MVT::v16i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget.hasSSE41()) {
+ if (VT.is512BitVector()) {
+ // On AVX512BW targets we make use of the fact that VSELECT lowers
+ // to a masked blend which selects bytes based just on the sign bit
+ // extracted to a mask.
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ } else if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
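Both branches of SignBitSelect rely on the same primitive: a byte-wise select driven purely by the sign bit of each selector byte, provided by PBLENDVB on SSE4.1/AVX2 and, on AVX-512BW, by converting the sign bits to a vXi1 mask (CVT2MASK) and feeding a masked VSELECT. Element-wise the operation is simply (scalar sketch):

    #include <cstddef>
    #include <cstdint>

    // Each output byte is taken from one input or the other based only on
    // the sign bit of the corresponding selector byte.
    void signBitSelect(const uint8_t *Sel, const uint8_t *A, const uint8_t *B,
                       uint8_t *Out, size_t NumBytes) {
      for (size_t i = 0; i < NumBytes; ++i)
        Out[i] = (Sel[i] & 0x80) ? A[i] : B[i];
    }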
@@ -28633,17 +28803,20 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
+ assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
- // Check if the first operand is all zeros.This situation only
- // applies to avx512.
- if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // This situation only applies to avx512.
+ if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
+ CondVT.getVectorElementType() == MVT::i1) {
//Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
+ DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
+ DL, CondVT));
//Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
}
- assert(CondVT.isVector() && "Vector select expects a vector selector!");
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
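Editor's note: the AVX512 rewrite above rests on a simple select identity, restricted here to vXi1 conditions: inverting the condition (xor with all-ones) swaps the two operands. A one-line scalar illustration of the identity, offered only as a sketch:

    // vselect(C, 0, X) == vselect(~C, X, 0); ~C is built as xor(C, all-ones).
    int lane_before(bool c, int x) { return c ? 0 : x; }
    int lane_after(bool c, int x)  { return !c ? x : 0; }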
@@ -29282,11 +29455,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Combine:
+/// Combine brcond/cmov/setcc/.. based on comparing the result of
+/// atomic_load_add to use EFLAGS produced by the addition
+/// directly if possible. For example:
+///
+/// (setcc (cmp (atomic_load_add x, -C) C), COND_E)
+/// becomes:
+/// (setcc (LADD x, -C), COND_E)
+///
+/// and
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
-/// to:
+/// becomes:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
-/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+///
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG) {
@@ -29295,7 +29476,13 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
- // This only applies to variations of the common case:
+ // Can't replace the cmp if it has more uses than the one we're looking at.
+ // FIXME: We would like to be able to handle this, but would need to make sure
+ // all uses were updated.
+ if (!Cmp.hasOneUse())
+ return SDValue();
+
+ // This applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
@@ -29314,8 +29501,9 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
return SDValue();
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
- if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
+ if (!CmpRHSC)
return SDValue();
+ APInt Comparand = CmpRHSC->getAPIntValue();
const unsigned Opc = CmpLHS.getOpcode();
@@ -29331,16 +29519,19 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
- if (CC == X86::COND_S && Addend == 1)
+ if (Comparand == -Addend) {
+ // No change to CC.
+ } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) {
CC = X86::COND_LE;
- else if (CC == X86::COND_NS && Addend == 1)
+ } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) {
CC = X86::COND_G;
- else if (CC == X86::COND_G && Addend == -1)
+ } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) {
CC = X86::COND_GE;
- else if (CC == X86::COND_LE && Addend == -1)
+ } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) {
CC = X86::COND_L;
- else
+ } else {
return SDValue();
+ }
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
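Editor's note: the generalized combine above lets a comparison of the value returned by an atomic add reuse the EFLAGS of the LOCKed instruction whenever the comparand equals the negated addend (or one of the listed signed special cases). A hedged C++ source-level example of the kind of pattern this targets, assuming a typical reference-count release; the function name is illustrative only:

    #include <atomic>
    // fetch_add returns the old value, so `old == 1` after adding -1 is the
    // (cmp (atomic_load_add x, -C), C) shape with C == 1; the combine keeps
    // COND_E and tests the flags produced by the `lock add` directly.
    bool release(std::atomic<int> &refs) {
      return refs.fetch_add(-1, std::memory_order_acq_rel) == 1;
    }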
@@ -31083,10 +31274,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
/// Check if truncation with saturation form type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
-static bool
-isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX512())
return false;
+
+ // FIXME: Scalar type may be supported if we move it to vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
EVT SrcElVT = SrcVT.getScalarType();
EVT DstElVT = DstVT.getScalarType();
if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
@@ -31098,40 +31294,69 @@ isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
return false;
}
+/// Return true if a VPACK* instruction can be used for the given types
+/// and is available on \p Subtarget.
+static bool
+isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+ if (Subtarget.hasSSE2())
+ // v16i16 -> v16i8
+ if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8)
+ return true;
+ if (Subtarget.hasSSE41())
+ // v8i32 -> v8i16
+ if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16)
+ return true;
+ return false;
+}
+
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched or the unsupported on the current target.
-static SDValue
-detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) {
+/// matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
if (In.getOpcode() != ISD::UMIN)
return SDValue();
- EVT InVT = In.getValueType();
- // FIXME: Scalar type may be supported if we move it to vector register.
- if (!InVT.isVector() || !InVT.isSimple())
- return SDValue();
-
- if (!isSATValidOnSubtarget(InVT, VT, Subtarget))
- return SDValue();
-
//Saturation with truncation. We truncate from InVT to VT.
- assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
- SDValue SrcVal;
APInt C;
- if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C))
- SrcVal = In.getOperand(1);
- else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C))
- SrcVal = In.getOperand(0);
- else
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
+ // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow use of a VPMOVUS* instruction on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
+ return detectUSatPattern(In, VT);
+}
- // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
- // the element size of the destination type.
- return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ?
- SrcVal : SDValue();
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue USatVal = detectUSatPattern(In, VT);
+ if (USatVal) {
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL);
+ return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi);
+ }
+ }
+ return SDValue();
}
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
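Editor's note: detectUSatPattern above looks for truncate(umin(x, unsigned-max-of-destination)), and combineTruncateWithUSat then picks VPMOVUS* on AVX512 or a vector split plus PACKUS on SSE2/SSE4.1. A scalar sketch of the saturating narrow being recognized, offered only as an assumption-level illustration:

    #include <algorithm>
    #include <cstdint>
    // umin against the destination's max value followed by a truncate is an
    // unsigned saturating narrow; 0xFF plays the role of the splat constant
    // checked with APIntOps::isMask above.
    uint8_t sat_narrow_u32_to_u8(uint32_t x) {
      return static_cast<uint8_t>(std::min<uint32_t>(x, 0xFFu));
    }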
@@ -31701,7 +31926,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getMemOperand()->getFlags());
if (SDValue Val =
- detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
@@ -32326,9 +32551,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
- // Try the truncation with unsigned saturation.
- if (SDValue Val = detectUSatPattern(Src, VT, Subtarget))
- return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val);
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index d44d1395f243..230d1700b8d2 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5957,6 +5957,30 @@ let Predicates = [HasAVX512] in {
(VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
} // Predicates = [HasAVX512]
+// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
+// which produce unnecessary vmovs{s,d} instructions
+let Predicates = [HasAVX512] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+} // Predicates = [HasAVX512]
+
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
@@ -6136,6 +6160,21 @@ def : Pat<(f32 (fpround FR64X:$src)),
(COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
(COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
//===----------------------------------------------------------------------===//
// AVX-512 Vector convert from signed/unsigned integer to float/double
// and from float/double to signed/unsigned integer
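Editor's note: the new AVX-512 patterns above match the DAG shape clang emits for the scalar convert intrinsics, where the result of sint_to_fp (or fpround/fpextend) is merged back into the destination vector through X86Movss/X86Movsd, so the conversion selects directly without a separate vmovss/vmovsd. A hedged illustration of source code that produces that shape (assumed typical intrinsic usage, not part of the patch):

    #include <immintrin.h>
    // _mm_cvtsi32_ss converts the integer and merges it into the low lane of
    // dst, i.e. (X86Movss dst, (scalar_to_vector (sint_to_fp x))) in the DAG.
    __m128 cvt_int_to_low_lane(__m128 dst, int x) {
      return _mm_cvtsi32_ss(dst, x);
    }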
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
index 09971d586a41..1812d01711d1 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
InstrItinClass ri = arg_ri;
}
-
// scalar
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
@@ -1923,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
}
} // isCodeGenOnly = 1
+// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
+// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
+// vmovs{s,d} instructions
+let Predicates = [UseAVX] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE2]
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE1]
+
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
index 92c16214aa4a..d80dc4a9b5e8 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -216,7 +216,7 @@ protected:
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
- /// True if 16-bit divides are significantly faster than
+ /// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
bool HasSlowDivide64;
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 107ed9359376..5715d826862e 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry SLMCostTable[] = {
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ // v2i64/v4i64 mul is custom lowered as a series of long
+ // multiplies(3), shifts(3) and adds(2).
+ // slm muldq version throughput is 2
+ { ISD::MUL, MVT::v2i64, 11 },
+ };
+
+ if (ST->isSLM()) {
+ if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
+ // Check if the operands can be shrunk into a smaller datatype.
+ bool Op1Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ bool Op2Signed = false;
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+
+ bool signedMode = Op1Signed | Op2Signed;
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ if (OpMinSize <= 7)
+ return LT.first * 3; // pmullw/sext
+ if (!signedMode && OpMinSize <= 8)
+ return LT.first * 3; // pmullw/zext
+ if (OpMinSize <= 15)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ if (!signedMode && OpMinSize <= 16)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ }
+ if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
+ LT.second)) {
+ return LT.first * Entry->Cost;
+ }
+ }
+
if (ISD == ISD::SDIV &&
Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
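Editor's note: the SLM-specific costs above reflect how a v4i32 multiply gets cheaper when both operands are known to fit in 16 bits: pmullw supplies the low halves and pmulhw/pmulhuw the high halves of each 32-bit product, with a shuffle to interleave them. A hedged scalar sketch of one lane of that decomposition, illustrative only and not the backend's code:

    #include <cstdint>
    // One lane of a 32-bit product built from 16-bit multiplies, the shape
    // behind the "pmullw/pmulhw/pshuf" cost comments above.
    uint32_t mul_from_16bit_halves(uint16_t a, uint16_t b) {
      uint16_t lo = static_cast<uint16_t>(a * b);                   // pmullw
      uint16_t hi = static_cast<uint16_t>((uint32_t(a) * b) >> 16); // pmulhuw
      return (uint32_t(hi) << 16) | lo;                             // interleave
    }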
@@ -276,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
+
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
index c013805f4321..ecaaf951cff7 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -60,7 +60,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);