aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- lib/Target/X86/X86.td 3
-rw-r--r-- lib/Target/X86/X86CallingConv.td 42
-rw-r--r-- lib/Target/X86/X86FastISel.cpp 7
-rw-r--r-- lib/Target/X86/X86FixupLEAs.cpp 269
-rw-r--r-- lib/Target/X86/X86ISelLowering.cpp 162
-rw-r--r-- lib/Target/X86/X86InstrAVX512.td 132
-rw-r--r-- lib/Target/X86/X86InstrFragmentsSIMD.td 6
-rw-r--r-- lib/Target/X86/X86InstrInfo.cpp 4
-rw-r--r-- lib/Target/X86/X86InstrInfo.td 13
-rw-r--r-- lib/Target/X86/X86InstrTSX.td 5
-rw-r--r-- lib/Target/X86/X86InstructionSelector.cpp 24
-rw-r--r-- lib/Target/X86/X86LegalizerInfo.cpp 19
-rw-r--r-- lib/Target/X86/X86RegisterInfo.td 4
-rw-r--r-- lib/Target/X86/X86Subtarget.cpp 7
-rw-r--r-- lib/Target/X86/X86Subtarget.h 15
-rw-r--r-- lib/Target/X86/X86TargetMachine.cpp 15
-rw-r--r-- lib/Target/X86/X86TargetTransformInfo.cpp 36
17 files changed, 497 insertions, 266 deletions
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 6781d761a1c4..7d146d050a5c 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -73,8 +73,8 @@ def CC_#NAME : CallingConv<[
CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// Promote v8i1/v16i1/v32i1 arguments to i32.
CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
@@ -146,8 +146,8 @@ def CC_#NAME : CallingConv<[
]>;
def RetCC_#NAME : CallingConv<[
- // Promote i1, v8i1 arguments to i8.
- CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+ // Promote i1, v1i1, v8i1 arguments to i8.
+ CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>,
// Promote v16i1 arguments to i16.
CCIfType<[v16i1], CCPromoteToType<i16>>,
@@ -207,6 +207,7 @@ def RetCC_X86Common : CallingConv<[
//
// For code that doesn't care about the ABI, we allow returning more than two
// integer values in registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
@@ -375,6 +376,7 @@ def RetCC_X86_64_Swift : CallingConv<[
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
// For integers, ECX, R8D can be used as extra return registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
@@ -485,8 +487,8 @@ def CC_X86_64_C : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<8, 8>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
@@ -584,8 +586,8 @@ def CC_X86_Win64_C : CallingConv<[
// FIXME: Handle byval stuff.
// FIXME: Handle varargs.
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
@@ -796,8 +798,8 @@ def CC_X86_32_Common : CallingConv<[
]>;
def CC_X86_32_C : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
@@ -816,8 +818,8 @@ def CC_X86_32_MCU : CallingConv<[
// puts arguments in registers.
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// If the call is not a vararg call, some arguments may be passed
// in integer registers.
@@ -828,8 +830,8 @@ def CC_X86_32_MCU : CallingConv<[
]>;
def CC_X86_32_FastCall : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
@@ -858,15 +860,15 @@ def CC_X86_32_ThisCall_Common : CallingConv<[
]>;
def CC_X86_32_ThisCall_Mingw : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
CCDelegateTo<CC_X86_32_ThisCall_Common>
]>;
def CC_X86_32_ThisCall_Win : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// Pass sret arguments indirectly through stack.
CCIfSRet<CCAssignToStack<4, 4>>,
@@ -885,8 +887,8 @@ def CC_X86_32_FastCC : CallingConv<[
// puts arguments in registers.
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index fc3b4836c178..3cfb924abd01 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3647,13 +3647,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
- if (Subtarget->hasAVX512()) {
- // Need to copy to a VK1 register.
- unsigned ResultReg = createResultReg(&X86::VK1RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg);
- return ResultReg;
- }
case MVT::i8:
return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
X86::sub_8bit);
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
+
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on SNB+ try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI);
+
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) {
+ initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+ }
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
bool OptIncDec;
bool OptLEA;
};
-char FixupLEAPass::ID = 0;
}
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int opcode) {
- return opcode == X86::LEA16r || opcode == X86::LEA32r ||
- opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) { // Opcode filter: true only for LEA16r, LEA32r, LEA64r and LEA64_32r.
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) { // True for EBP, RBP and R13, the base registers the Intel note quoted on processInstrForSlow3OpLEA's declaration lists as slow.
+ return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) { // True when Op is a register operand that names an actual register.
+ return Op.isReg() && Op.getReg() != X86::NoRegister; // X86::NoRegister marks an absent base/index slot.
+}
+/// hasInefficientLEABaseReg - True for an LEA that uses both a base and an
+/// index register where the base is EBP, RBP, or R13.
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+ isRegOperand(Index); // The index must name a real register, not X86::NoRegister.
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) { // True when the displacement is a non-zero immediate or a global address.
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); // NOTE(review): any other non-immediate displacement kind is reported as "no offset" - confirm callers never see one.
+}
+
+// True for an LEA that has all three address operands present: a real base register, a real index register, and an offset as defined by hasLEAOffset.
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+ const MachineOperand &Index,
+ const MachineOperand &Offset) {
+ return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) { // Maps an LEA opcode to the register-register ADD opcode used to replace it.
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction"); // Callers are expected to have checked isLEA() first.
+ case X86::LEA16r:
+ return X86::ADD16rr;
+ case X86::LEA32r:
+ return X86::ADD32rr;
+ case X86::LEA64_32r: // NOTE(review): grouped with the 64-bit ADD here, while getADDriFromLEA maps LEA64_32r to the 32-bit immediate form - confirm this asymmetry is intended.
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { // Maps an LEA opcode to the register-immediate ADD opcode used to add Offset.
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); // Only an immediate that passes isInt<8> selects the *ri8 opcode; non-immediate offsets always take the wider form.
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction"); // Callers are expected to have checked isLEA() first.
+ case X86::LEA16r:
+ return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+ case X86::LEA32r:
+ case X86::LEA64_32r: // Uses the 32-bit ADD forms, same as LEA32r.
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
}
/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int opcode = MI.getOpcode();
- if (!isLEA(opcode))
+ const int Opcode = MI.getOpcode();
+ if (!isLEA(Opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- int addrr_opcode, addri_opcode;
- switch (opcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- addrr_opcode = X86::ADD16rr;
- addri_opcode = X86::ADD16ri;
- break;
- case X86::LEA32r:
- addrr_opcode = X86::ADD32rr;
- addri_opcode = X86::ADD32ri;
- break;
- case X86::LEA64_32r:
- case X86::LEA64r:
- addrr_opcode = X86::ADD64rr;
- addri_opcode = X86::ADD64ri32;
- break;
- }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .add(Dst)
- .add(Src1)
- .add(Src2);
- MFI->insert(I, NewMI);
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI =
+ BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .add(Dst)
+ NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ I = NewMI;
+ }
+}
+
+MachineInstr * // Returns the last instruction of the replacement sequence, or nullptr when MI is left unchanged.
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, // Candidate instruction; it is not erased here, the caller does that.
+ MachineFunction::iterator MFI) { // Block holding MI; replacements are built with BuildMI(*MFI, MI, ...), i.e. inserted before MI.
+ // Only LEA opcodes are candidates.
+ const int LEAOpcode = MI.getOpcode();
+ if (!isLEA(LEAOpcode))
+ return nullptr;
+ // LEA operand layout as read below: 0 = dst, 1 = base, 2 = scale, 3 = index, 4 = offset, 5 = segment.
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1);
+ const MachineOperand &Scale = MI.getOperand(2);
+ const MachineOperand &Index = MI.getOperand(3);
+ const MachineOperand &Offset = MI.getOperand(4);
+ const MachineOperand &Segment = MI.getOperand(5);
+ // Bail out unless this is one of the two slow forms handled here, EFLAGS may be clobbered at MI, and no segment register is used.
+ if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ hasInefficientLEABaseReg(Base, Index)) ||
+ !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ Segment.getReg() != X86::NoRegister)
+ return nullptr;
+
+ unsigned int DstR = Dst.getReg();
+ unsigned int BaseR = Base.getReg();
+ unsigned int IndexR = Index.getReg();
+ unsigned SSDstR = // For LEA64_32r, the 64-bit super-register of DstR; otherwise DstR itself. Used for the BaseR comparison below.
+ (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseR);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+
+ // Skip these cases since it takes more than 2 instructions
+ // to replace the LEA instruction.
+ if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
+ return nullptr;
+ if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
+ (IsInefficientIndex || !IsScale1))
+ return nullptr;
+
+ const DebugLoc DL = MI.getDebugLoc();
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
+ const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+ // 1.lea (%base,%index,1), %base => add %index,%base
+ // 2.lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { // NOTE(review): compares DstR, not SSDstR; for LEA64_32r DstR is the un-widened register - confirm this is intended.
+ const MachineOperand &Src = DstR == BaseR ? Index : Base; // Add whichever of base/index is not already the destination.
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // If the base is inefficient try switching the index and base operands,
+ // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+ // lea (%base,%index,scale),%dst; add offset,%dst
+ if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .add(IsInefficientBase ? Index : Base) // Swapped base/index when the original base is inefficient.
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0) // The displacement is dropped from the LEA and re-added by the ADD below when present.
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // Handle the rest of the cases with inefficient base register:
+ assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+ DEBUG(MI.getPrevNode()->dump();); // Dumps the instruction now preceding MI, i.e. the copy just emitted.
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .addReg(0) // No base register in the rewritten LEA.
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+
+ NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+
+ else {
+ if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+ MFI->erase(I);
+ I = NewMI;
+ }
+ } else
+ processInstruction(I, MFI);
+ }
}
}
return false;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 11c08292518a..37b248416e4a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1140,7 +1140,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- addRegisterClass(MVT::i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1155,16 +1155,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::SETCC, MVT::i1, Custom);
- setOperationAction(ISD::SETCCE, MVT::i1, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::XOR, MVT::i1, Legal);
- setOperationAction(ISD::OR, MVT::i1, Legal);
- setOperationAction(ISD::AND, MVT::i1, Legal);
- setOperationAction(ISD::SUB, MVT::i1, Custom);
- setOperationAction(ISD::ADD, MVT::i1, Custom);
- setOperationAction(ISD::MUL, MVT::i1, Custom);
for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
@@ -1233,7 +1223,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
- setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
@@ -1311,7 +1300,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
@@ -1699,7 +1690,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
- return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
+ return MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
@@ -2480,6 +2471,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
if (ValVT == MVT::v64i1) {
// In 32 bit machine, this case is handled by getv64i1Argument
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
@@ -2502,7 +2496,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
-
return DAG.getBitcast(ValVT, ValReturned);
}
@@ -2809,8 +2802,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
- return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
- : Val;
+ return ExtendedInMem
+ ? (VA.getValVT().isVector()
+ ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+ : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+ : Val;
}
// FIXME: Get this from tablegen.
@@ -2960,7 +2956,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
- else if (RegVT == MVT::i1)
+ else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
@@ -6871,7 +6867,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
@@ -6914,7 +6910,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
@@ -13946,7 +13942,6 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
- assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
@@ -13980,8 +13975,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
}
SDValue
@@ -13992,7 +13987,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
- if (Op.getSimpleValueType() == MVT::i1)
+ if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
@@ -14163,10 +14158,13 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
return EltInVec;
}
- // Insertion of one bit into first or last position
- // can be done with two SHIFTs + OR.
+ // Insertion of one bit into first position
if (IdxVal == 0 ) {
- // EltInVec already at correct index and other bits are 0.
+ // Clean top bits of vector.
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
// Clean the first bit in source vector.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
@@ -14175,6 +14173,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
+ // Insertion of one bit into last position
if (IdxVal == NumElems -1) {
// Move the bit to the last position inside the vector.
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
@@ -17322,8 +17321,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
- && "SetCC type must be 8-bit or 1-bit integer");
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
@@ -17457,7 +17455,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SSECC != 8) {
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
DL, VT, Cmp, Op1, Op2);
@@ -17505,9 +17503,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
- Subtarget.hasAVX512())
- return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+ if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
@@ -19048,8 +19047,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
-/// The mask is coming as MVT::i8 and it should be truncated
-/// to MVT::i1 while lowering masking intrinsics.
+/// The mask is coming as MVT::i8 and it should be transformed
+/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
@@ -19064,11 +19063,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- // The mask should be of type MVT::i1
- SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+ SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND)
+ Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
@@ -19507,10 +19505,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
- SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
}
case CMP_MASK:
case CMP_MASK_CC: {
@@ -19570,18 +19569,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
}
//default rounding mode
if(!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, dl,
MVT::i1),
Subtarget, DAG);
-
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
+ DAG.getIntPtrConstant(0, dl));
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -19629,13 +19628,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
- FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8));
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
- return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
+ FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
+ DAG.getIntPtrConstant(0, dl));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -23385,8 +23384,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
- EVT EltVT = NVT.getVectorElementType();
-
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
@@ -23404,6 +23401,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
+ EVT EltVT = InOp.getOperand(0).getValueType();
+
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
@@ -24709,16 +24708,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
// xbegin sinkMBB
//
// mainMBB:
- // eax = -1
+ // s0 = -1
+ //
+ // fallMBB:
+ // eax = # XABORT_DEF
+ // s1 = eax
//
// sinkMBB:
- // v = eax
+ // v = phi(s0/mainMBB, s1/fallMBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
+ MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -24726,25 +24731,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned fallDstReg = MRI.createVirtualRegister(RC);
+
// thisMBB:
- // xbegin sinkMBB
+ // xbegin fallMBB
// # fallthrough to mainMBB
- // # abortion to sinkMBB
- BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ // # abortion to fallMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
- thisMBB->addSuccessor(sinkMBB);
+ thisMBB->addSuccessor(fallMBB);
// mainMBB:
- // EAX = -1
- BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ // mainDstReg := -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
+ BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
- // sinkMBB:
- // EAX is live into the sinkMBB
- sinkMBB->addLiveIn(X86::EAX);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
- MI.getOperand(0).getReg())
+ // fallMBB:
+ // ; pseudo instruction to model hardware's definition from XABORT
+ // EAX := XABORT_DEF
+ // fallDstReg := EAX
+ BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
+ BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
+ fallMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
@@ -29574,7 +29594,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
DAG.getAllOnesConstant(DL, CondVT));
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
@@ -31321,13 +31341,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
- SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
- CMP01,
- DAG.getConstant(x86cc, DL, MVT::i8));
- if (N->getValueType(0) != MVT::i1)
- return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
- FSetCC);
- return FSetCC;
+ SDValue FSetCC =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 71d395244b4a..f9344413bbcf 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -31,8 +31,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
// The mask VT.
- ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
- "v" # NumElts # "i1"));
+ ValueType KVT = !cast<ValueType>("v" # NumElts # "i1");
// Suffix used in the instruction mnemonic.
string Suffix = suffix;
@@ -2263,7 +2262,7 @@ let Predicates = [HasAVX512, NoDQI] in {
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
- def : Pat<(i1 (load addr:$src)),
+ def : Pat<(v1i1 (load addr:$src)),
(COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
@@ -2280,77 +2279,45 @@ let Predicates = [HasBWI] in {
}
let Predicates = [HasAVX512] in {
- def : Pat<(i1 (trunc (i64 GR64:$src))),
- (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
- (i32 1)), VK1)>;
+ multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> {
+ def : Pat<(maskVT (scalar_to_vector GR32:$src)),
+ (COPY_TO_REGCLASS GR32:$src, maskRC)>;
- def : Pat<(i1 (trunc (i32 GR32:$src))),
- (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>;
+ def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS maskRC:$src, GR32)>;
- def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
- (COPY_TO_REGCLASS GR32:$src, VK1)>;
+ def : Pat<(maskVT (scalar_to_vector GR8:$src)),
+ (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
- def : Pat<(i1 (trunc (i8 GR8:$src))),
- (COPY_TO_REGCLASS
- (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1)), VK1)>;
+ def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
- def : Pat<(i1 (trunc (i16 GR16:$src))),
- (COPY_TO_REGCLASS
- (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit), (i32 1)), VK1)>;
-
- def : Pat<(i32 (zext VK1:$src)),
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>;
-
- def : Pat<(i32 (anyext VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, GR32)>;
-
- def : Pat<(i8 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>;
-
- def : Pat<(i8 (anyext VK1:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
+ def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))),
+ (COPY_TO_REGCLASS maskRC:$src, GR32)>;
+ }
- def : Pat<(i64 (zext VK1:$src)),
- (SUBREG_TO_REG (i64 0),
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>;
+ defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
+ defm : operation_gpr_mask_copy_lowering<VK2, v2i1>;
+ defm : operation_gpr_mask_copy_lowering<VK4, v4i1>;
+ defm : operation_gpr_mask_copy_lowering<VK8, v8i1>;
+ defm : operation_gpr_mask_copy_lowering<VK16, v16i1>;
+ defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
+ defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
- def : Pat<(i64 (anyext VK1:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
+ def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK1)>;
+ def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK16)>;
+ def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK8)>;
- def : Pat<(i16 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>;
-
- def : Pat<(i16 (anyext VK1:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
-}
-def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK16)>;
-def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK8)>;
-def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK4)>;
-def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK2)>;
-def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK32)>;
-def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK64)>;
-
-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-
-def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>;
+}
// Mask unary operation
// - KNOT
@@ -2551,14 +2518,11 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+ def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>;
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
- let AddedComplexity = 10 in { // To optimize isel table.
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
- def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
- def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
- }
+ def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
@@ -2570,6 +2534,12 @@ multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subV
def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
(VT (COPY_TO_REGCLASS subRC:$src, RC))>;
}
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>;
@@ -3249,7 +3219,7 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
@@ -3260,7 +3230,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
@@ -3279,7 +3249,7 @@ def : Pat<(masked_store addr:$dst, Mask,
(iPTR 0))),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
@@ -3296,7 +3266,7 @@ def : Pat<(masked_store addr:$dst, Mask,
(iPTR 0))),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
- (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
@@ -3310,7 +3280,7 @@ def : Pat<(_.info128.VT (extract_subvector
(v16i32 immAllZerosV))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
@@ -3322,7 +3292,7 @@ def : Pat<(_.info128.VT (extract_subvector
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
}
@@ -3338,7 +3308,7 @@ def : Pat<(_.info128.VT (extract_subvector
(v16i32 immAllZerosV))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
- (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
@@ -3350,7 +3320,7 @@ def : Pat<(_.info128.VT (extract_subvector
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
- (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
}
@@ -3381,7 +3351,7 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
- (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)),
+ (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
let hasSideEffects = 0 in
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 9867ba84bb9b..e2e228f5544b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -274,7 +274,7 @@ def X86select : SDNode<"X86ISD::SELECT",
SDTCisSameNumEltsAs<0, 1>]>>;
def X86selects : SDNode<"X86ISD::SELECTS",
- SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
+ SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<2, 3>]>>;
@@ -441,7 +441,7 @@ def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
SDTCisSameNumEltsAs<0,1>,
SDTCisVT<2, i32>]>, []>;
def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
- SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+ SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>,
SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
@@ -451,7 +451,7 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
- [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+ [SDTCisVec<1>,
SDTCisPtrTy<2>]>, []>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 092ceb207ada..f7083a7448ce 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -10511,9 +10511,7 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
MachineFunction &MF,
- bool IsTailCall) const {
- return;
-}
+ bool IsTailCall) const {}
MachineBasicBlock::iterator
X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 4d7d8ece92d9..01df07e1715f 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -896,9 +896,16 @@ def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
"TM.getCodeModel() == CodeModel::Kernel">;
def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
-def OptForSize : Predicate<"Subtarget->getOptForSize()">;
-def OptForMinSize : Predicate<"Subtarget->getOptForMinSize()">;
-def OptForSpeed : Predicate<"!Subtarget->getOptForSize()">;
+
+// We could compute these on a per-module basis but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def OptForSize : Predicate<"MF->getFunction()->optForSize()">;
+ def OptForMinSize : Predicate<"MF->getFunction()->optForMinSize()">;
+ def OptForSpeed : Predicate<"!MF->getFunction()->optForSize()">;
+}
+
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 38ac8be94483..61aac58a491f 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -30,6 +30,11 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
"xbegin\t$dst", []>, OpSize32;
}
+// Pseudo instruction to fake the definition of EAX on the fallback code path.
+let isPseudo = 1, Defs = [EAX] in {
+def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
+}
+
def XEND : I<0x01, MRM_D5, (outs), (ins),
"xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 5eb5ad52840a..61956f741820 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -449,24 +449,30 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
if (!SrcRC)
return false;
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
- return false;
- }
-
+ unsigned SubIdx;
if (DstRC == SrcRC) {
// Nothing to be done
+ SubIdx = X86::NoSubRegister;
} else if (DstRC == &X86::GR32RegClass) {
- I.getOperand(1).setSubReg(X86::sub_32bit);
+ SubIdx = X86::sub_32bit;
} else if (DstRC == &X86::GR16RegClass) {
- I.getOperand(1).setSubReg(X86::sub_16bit);
+ SubIdx = X86::sub_16bit;
} else if (DstRC == &X86::GR8RegClass) {
- I.getOperand(1).setSubReg(X86::sub_8bit);
+ SubIdx = X86::sub_8bit;
} else {
return false;
}
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ I.getOperand(1).setSubReg(SubIdx);
+
I.setDesc(TII.get(X86::COPY));
return true;
}
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 8ce240714f17..da724f5d8989 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -184,6 +184,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
return;
const LLT s64 = LLT::scalar(64);
+ const LLT v16s8 = LLT::vector(16, 8);
const LLT v8s16 = LLT::vector(8, 16);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
@@ -193,7 +194,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
setAction({BinOp, Ty}, Legal);
for (unsigned BinOp : {G_ADD, G_SUB})
- for (auto Ty : {v4s32})
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
setAction({BinOp, Ty}, Legal);
setAction({G_MUL, v8s16}, Legal);
@@ -212,8 +213,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() {
if (!Subtarget.hasAVX2())
return;
+ const LLT v32s8 = LLT::vector(32, 8);
const LLT v16s16 = LLT::vector(16, 16);
const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+ setAction({BinOp, Ty}, Legal);
for (auto Ty : {v16s16, v8s32})
setAction({G_MUL, Ty}, Legal);
@@ -224,6 +231,11 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
return;
const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({BinOp, Ty}, Legal);
setAction({G_MUL, v16s32}, Legal);
@@ -261,8 +273,13 @@ void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
return;
+ const LLT v64s8 = LLT::vector(64, 8);
const LLT v32s16 = LLT::vector(32, 16);
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v64s8, v32s16})
+ setAction({BinOp, Ty}, Legal);
+
setAction({G_MUL, v32s16}, Legal);
/************ VLX *******************/
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index d235d2b40b15..3a61a7247c72 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -511,7 +511,7 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
// Mask registers
-def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
@@ -519,7 +519,7 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
-def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;}
def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index d66d39dcee17..2b1f43bffd71 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -320,6 +320,7 @@ void X86Subtarget::initializeEnvironment() {
CallRegIndirect = false;
LEAUsesAG = false;
SlowLEA = false;
+ Slow3OpsLEA = false;
SlowIncDec = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
@@ -336,8 +337,7 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
- unsigned StackAlignOverride, bool OptForSize,
- bool OptForMinSize)
+ unsigned StackAlignOverride)
: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
@@ -347,8 +347,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering(*this, getStackAlignment()), OptForSize(OptForSize),
- OptForMinSize(OptForMinSize) {
+ FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
setPICStyle(PICStyles::None);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..a9f3a2aee1be 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// True if LEA is slow when it has all three source operands (base, index,
+ /// and offset), or when it uses base and index registers where the base is
+ /// EBP, RBP, or R13
+ bool Slow3OpsLEA;
+
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -331,16 +336,12 @@ private:
X86TargetLowering TLInfo;
X86FrameLowering FrameLowering;
- bool OptForSize;
- bool OptForMinSize;
-
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM, unsigned StackAlignOverride,
- bool OptForSize, bool OptForMinSize);
+ const X86TargetMachine &TM, unsigned StackAlignOverride);
/// This object will take onwership of \p GISelAccessor.
void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
@@ -490,6 +491,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
@@ -507,9 +509,6 @@ public:
bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
- bool getOptForSize() const { return OptForSize; }
- bool getOptForMinSize() const { return OptForMinSize; }
-
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
/// no-sse2). There isn't any reason to disable it if the target processor
/// supports it.
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 9a82e6e50463..53a8e83b36fc 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
} // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
}
@@ -268,12 +270,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
FS = Key.substr(CPU.size());
- bool OptForSize = F.optForSize();
- bool OptForMinSize = F.optForMinSize();
-
- Key += std::string(OptForSize ? "+" : "-") + "optforsize";
- Key += std::string(OptForMinSize ? "+" : "-") + "optforminsize";
-
auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
@@ -281,8 +277,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
- Options.StackAlignmentOverride,
- OptForSize, OptForMinSize);
+ Options.StackAlignmentOverride);
#ifndef LLVM_BUILD_GLOBAL_ISEL
GISelAccessor *GISel = new GISelAccessor();
#else
@@ -378,12 +373,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
}
void X86PassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getX86TargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
}
bool X86PassConfig::addInstSelector() {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 8566bd91c89e..fe94079fd869 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1392,15 +1392,47 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+ static const CostTblEntry AVX512CDCostTbl[] = {
+ { ISD::CTLZ, MVT::v8i64, 1 },
+ { ISD::CTLZ, MVT::v16i32, 1 },
+ { ISD::CTLZ, MVT::v32i16, 8 },
+ { ISD::CTLZ, MVT::v64i8, 20 },
+ { ISD::CTLZ, MVT::v4i64, 1 },
+ { ISD::CTLZ, MVT::v8i32, 1 },
+ { ISD::CTLZ, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v32i8, 10 },
+ { ISD::CTLZ, MVT::v2i64, 1 },
+ { ISD::CTLZ, MVT::v4i32, 1 },
+ { ISD::CTLZ, MVT::v8i16, 4 },
+ { ISD::CTLZ, MVT::v16i8, 4 },
+ };
static const CostTblEntry AVX512BWCostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 5 },
{ ISD::BITREVERSE, MVT::v16i32, 5 },
{ ISD::BITREVERSE, MVT::v32i16, 5 },
{ ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::CTLZ, MVT::v8i64, 23 },
+ { ISD::CTLZ, MVT::v16i32, 22 },
+ { ISD::CTLZ, MVT::v32i16, 18 },
+ { ISD::CTLZ, MVT::v64i8, 17 },
+ { ISD::CTPOP, MVT::v8i64, 7 },
+ { ISD::CTPOP, MVT::v16i32, 11 },
+ { ISD::CTPOP, MVT::v32i16, 9 },
+ { ISD::CTPOP, MVT::v64i8, 6 },
+ { ISD::CTTZ, MVT::v8i64, 10 },
+ { ISD::CTTZ, MVT::v16i32, 14 },
+ { ISD::CTTZ, MVT::v32i16, 12 },
+ { ISD::CTTZ, MVT::v64i8, 9 },
};
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::CTLZ, MVT::v8i64, 29 },
+ { ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTPOP, MVT::v8i64, 16 },
+ { ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTTZ, MVT::v8i64, 20 },
+ { ISD::CTTZ, MVT::v16i32, 28 },
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1560,6 +1592,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->hasCDI())
+ if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
return LT.first * Entry->Cost;