Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- lib/Target/X86/X86.td | 3
-rw-r--r-- lib/Target/X86/X86CallingConv.td | 42
-rw-r--r-- lib/Target/X86/X86FastISel.cpp | 7
-rw-r--r-- lib/Target/X86/X86FixupLEAs.cpp | 269
-rw-r--r-- lib/Target/X86/X86ISelLowering.cpp | 162
-rw-r--r-- lib/Target/X86/X86InstrAVX512.td | 132
-rw-r--r-- lib/Target/X86/X86InstrFragmentsSIMD.td | 6
-rw-r--r-- lib/Target/X86/X86InstrInfo.cpp | 4
-rw-r--r-- lib/Target/X86/X86InstrInfo.td | 13
-rw-r--r-- lib/Target/X86/X86InstrTSX.td | 5
-rw-r--r-- lib/Target/X86/X86InstructionSelector.cpp | 24
-rw-r--r-- lib/Target/X86/X86LegalizerInfo.cpp | 19
-rw-r--r-- lib/Target/X86/X86RegisterInfo.td | 4
-rw-r--r-- lib/Target/X86/X86Subtarget.cpp | 7
-rw-r--r-- lib/Target/X86/X86Subtarget.h | 15
-rw-r--r-- lib/Target/X86/X86TargetMachine.cpp | 15
-rw-r--r-- lib/Target/X86/X86TargetTransformInfo.cpp | 36
17 files changed, 497 insertions, 266 deletions
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 784c3a6557ff..3a421fe77392 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", + "LEA instruction with 3 ops or certain registers is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat @@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, + FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate ]>; diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 6781d761a1c4..7d146d050a5c 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -73,8 +73,8 @@ def CC_#NAME : CallingConv<[ CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>, CCIfByVal<CCPassByVal<4, 4>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // Promote v8i1/v16i1/v32i1 arguments to i32. CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>, @@ -146,8 +146,8 @@ def CC_#NAME : CallingConv<[ ]>; def RetCC_#NAME : CallingConv<[ - // Promote i1, v8i1 arguments to i8. - CCIfType<[i1, v8i1], CCPromoteToType<i8>>, + // Promote i1, v1i1, v8i1 arguments to i8. + CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>, // Promote v16i1 arguments to i16. CCIfType<[v16i1], CCPromoteToType<i16>>, @@ -207,6 +207,7 @@ def RetCC_X86Common : CallingConv<[ // // For code that doesn't care about the ABI, we allow returning more than two // integer values in registers. + CCIfType<[v1i1], CCPromoteToType<i8>>, CCIfType<[i1], CCPromoteToType<i8>>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, @@ -375,6 +376,7 @@ def RetCC_X86_64_Swift : CallingConv<[ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, // For integers, ECX, R8D can be used as extra return registers. + CCIfType<[v1i1], CCPromoteToType<i8>>, CCIfType<[i1], CCPromoteToType<i8>>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>, @@ -485,8 +487,8 @@ def CC_X86_64_C : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<8, 8>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, @@ -584,8 +586,8 @@ def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. // FIXME: Handle varargs. - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. CCIfNest<CCAssignToReg<[R10]>>, @@ -796,8 +798,8 @@ def CC_X86_32_Common : CallingConv<[ ]>; def CC_X86_32_C : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. 
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, @@ -816,8 +818,8 @@ def CC_X86_32_MCU : CallingConv<[ // puts arguments in registers. CCIfByVal<CCPassByVal<4, 4>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // If the call is not a vararg call, some arguments may be passed // in integer registers. @@ -828,8 +830,8 @@ def CC_X86_32_MCU : CallingConv<[ ]>; def CC_X86_32_FastCall : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest<CCAssignToReg<[EAX]>>, @@ -858,15 +860,15 @@ def CC_X86_32_ThisCall_Common : CallingConv<[ ]>; def CC_X86_32_ThisCall_Mingw : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, CCDelegateTo<CC_X86_32_ThisCall_Common> ]>; def CC_X86_32_ThisCall_Win : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // Pass sret arguments indirectly through stack. CCIfSRet<CCAssignToStack<4, 4>>, @@ -885,8 +887,8 @@ def CC_X86_32_FastCC : CallingConv<[ // puts arguments in registers. CCIfByVal<CCPassByVal<4, 4>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest<CCAssignToReg<[EAX]>>, diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index fc3b4836c178..3cfb924abd01 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3647,13 +3647,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: - if (Subtarget->hasAVX512()) { - // Need to copy to a VK1 register. 
- unsigned ResultReg = createResultReg(&X86::VK1RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg); - return ResultReg; - } case MVT::i8: return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, X86::sub_8bit); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 2cd4c1a3e7b3..9f649dad8bc0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -27,20 +27,26 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -#define DEBUG_TYPE "x86-fixup-LEAs" +namespace llvm { +void initializeFixupLEAPassPass(PassRegistry &); +} + +#define FIXUPLEA_DESC "X86 LEA Fixup" +#define FIXUPLEA_NAME "x86-fixup-LEAs" + +#define DEBUG_TYPE FIXUPLEA_NAME STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; + /// \brief Loop over all of the instructions in the basic block /// replacing applicable instructions with LEA instructions, /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - StringRef getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + + /// \brief Given a LEA instruction which is unprofitable + /// on SNB+ try to replace it with other instructions. + /// According to Intel's Optimization Reference Manual: + /// " For LEA instructions with three source operands and some specific + /// situations, instruction latency has increased to 3 cycles, and must + /// dispatch via port 1: + /// - LEA that has all three source operands: base, index, and offset + /// - LEA that uses base and index registers where the base is EBP, RBP, + /// or R13 + /// - LEA that uses RIP relative addressing mode + /// - LEA that uses 16-bit addressing mode " + /// This function currently handles the first 2 cases only. + MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg /// and convert them to INC or DEC respectively. 
bool fixupIncDec(MachineBasicBlock::iterator &I, @@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass { MachineBasicBlock::iterator &MBBI) const; public: - FixupLEAPass() : MachineFunctionPass(ID) {} + static char ID; + + StringRef getPassName() const override { return FIXUPLEA_DESC; } + + FixupLEAPass() : MachineFunctionPass(ID) { + initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); + } /// \brief Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -104,9 +132,12 @@ private: bool OptIncDec; bool OptLEA; }; -char FixupLEAPass::ID = 0; } +char FixupLEAPass::ID = 0; + +INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) + MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { @@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); - OptLEA = ST.LEAusesAG() || ST.slowLEA(); + OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA(); if (!OptLEA && !OptIncDec) return false; @@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int opcode) { - return opcode == X86::LEA16r || opcode == X86::LEA32r || - opcode == X86::LEA64r || opcode == X86::LEA64_32r; +static inline bool isLEA(const int Opcode) { + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +static inline bool isInefficientLEAReg(unsigned int Reg) { + return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13; +} + +static inline bool isRegOperand(const MachineOperand &Op) { + return Op.isReg() && Op.getReg() != X86::NoRegister; +} +/// hasIneffecientLEARegs - LEA that uses base and index registers +/// where the base is EBP, RBP, or R13 +static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, + const MachineOperand &Index) { + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && + isRegOperand(Index); +} + +static inline bool hasLEAOffset(const MachineOperand &Offset) { + return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); +} + +// LEA instruction that has all three operands: offset, base and index +static inline bool isThreeOperandsLEA(const MachineOperand &Base, + const MachineOperand &Index, + const MachineOperand &Offset) { + return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset); +} + +static inline int getADDrrFromLEA(int LEAOpcode) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return X86::ADD16rr; + case X86::LEA32r: + return X86::ADD32rr; + case X86::LEA64_32r: + case X86::LEA64r: + return X86::ADD64rr; + } +} + +static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { + bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; + case X86::LEA32r: + case X86::LEA64_32r: + return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri; + case X86::LEA64r: + return IsInt8 ? 
X86::ADD64ri8 : X86::ADD64ri32; + } } /// isLEASimpleIncOrDec - Does this LEA have one these forms: @@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p, void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr &MI = *I; - const int opcode = MI.getOpcode(); - if (!isLEA(opcode)) + const int Opcode = MI.getOpcode(); + if (!isLEA(Opcode)) return; if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; if (MI.getOperand(2).getImm() > 1) return; - int addrr_opcode, addri_opcode; - switch (opcode) { - default: - llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - addrr_opcode = X86::ADD16rr; - addri_opcode = X86::ADD16ri; - break; - case X86::LEA32r: - addrr_opcode = X86::ADD32rr; - addri_opcode = X86::ADD32ri; - break; - case X86::LEA64_32r: - case X86::LEA64r: - addrr_opcode = X86::ADD64rr; - addri_opcode = X86::ADD64ri32; - break; - } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); MachineInstr *NewMI = nullptr; - const MachineOperand &Dst = MI.getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { - const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3); - const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode)) - .add(Dst) - .add(Src1) - .add(Src2); - MFI->insert(I, NewMI); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); + const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = + BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); DEBUG(NewMI->dump();); } // Make ADD instruction for immediate if (MI.getOperand(4).getImm() != 0) { + const MCInstrDesc &ADDri = + TII->get(getADDriFromLEA(Opcode, MI.getOperand(4))); const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode)) - .add(Dst) + NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) .add(SrcR) .addImm(MI.getOperand(4).getImm()); - MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } if (NewMI) { MFI->erase(I); - I = static_cast<MachineBasicBlock::iterator>(NewMI); + I = NewMI; + } +} + +MachineInstr * +FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI) { + + const int LEAOpcode = MI.getOpcode(); + if (!isLEA(LEAOpcode)) + return nullptr; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Base = MI.getOperand(1); + const MachineOperand &Scale = MI.getOperand(2); + const MachineOperand &Index = MI.getOperand(3); + const MachineOperand &Offset = MI.getOperand(4); + const MachineOperand &Segment = MI.getOperand(5); + + if (!(isThreeOperandsLEA(Base, Index, Offset) || + hasInefficientLEABaseReg(Base, Index)) || + !TII->isSafeToClobberEFLAGS(*MFI, MI) || + Segment.getReg() != X86::NoRegister) + return nullptr; + + unsigned int DstR = Dst.getReg(); + unsigned int BaseR = Base.getReg(); + unsigned int IndexR = Index.getReg(); + unsigned SSDstR = + (LEAOpcode == X86::LEA64_32r) ? 
getX86SubSuperRegister(DstR, 64) : DstR; + bool IsScale1 = Scale.getImm() == 1; + bool IsInefficientBase = isInefficientLEAReg(BaseR); + bool IsInefficientIndex = isInefficientLEAReg(IndexR); + + // Skip these cases since it takes more than 2 instructions + // to replace the LEA instruction. + if (IsInefficientBase && SSDstR == BaseR && !IsScale1) + return nullptr; + if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && + (IsInefficientIndex || !IsScale1)) + return nullptr; + + const DebugLoc DL = MI.getDebugLoc(); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); + const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + + // First try to replace LEA with one or two (for the 3-op LEA case) + // add instructions: + // 1.lea (%base,%index,1), %base => add %index,%base + // 2.lea (%base,%index,1), %index => add %base,%index + if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { + const MachineOperand &Src = DstR == BaseR ? Index : Base; + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. + if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. 
+ if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // Handle the rest of the cases with inefficient base register: + assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(IsInefficientBase && "efficient base should be handled already!"); + + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst + if (IsScale1 && !hasLEAOffset(Offset)) { + TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill()); + DEBUG(MI.getPrevNode()->dump();); + + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + DEBUG(NewMI->dump();); + return NewMI; } + // lea offset(%base,%index,scale), %dst => + // lea offset( ,%index,scale), %dst; add %base,%dst + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); + DEBUG(NewMI->dump();); + + NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + DEBUG(NewMI->dump();); + return NewMI; } bool FixupLEAPass::processBasicBlock(MachineFunction &MF, @@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, if (OptLEA) { if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + + else { + if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) { + if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { + MFI->erase(I); + I = NewMI; + } + } else + processInstruction(I, MFI); + } } } return false; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 11c08292518a..37b248416e4a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1140,7 +1140,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - addRegisterClass(MVT::i1, &X86::VK1RegClass); + addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); @@ -1155,16 +1155,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SETCC, MVT::i1, Custom); - setOperationAction(ISD::SETCCE, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::XOR, MVT::i1, Legal); - setOperationAction(ISD::OR, MVT::i1, Legal); - setOperationAction(ISD::AND, MVT::i1, Legal); - setOperationAction(ISD::SUB, MVT::i1, Custom); - setOperationAction(ISD::ADD, MVT::i1, Custom); - setOperationAction(ISD::MUL, MVT::i1, Custom); for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, @@ -1233,7 +1223,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Custom); } } - setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); @@ -1311,7 +1300,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, 
MVT::v16i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); @@ -1699,7 +1690,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) - return Subtarget.hasAVX512() ? MVT::i1: MVT::i8; + return MVT::i8; if (VT.isSimple()) { MVT VVT = VT.getSimpleVT(); @@ -2480,6 +2471,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, SelectionDAG &DAG) { SDValue ValReturned = ValArg; + if (ValVT == MVT::v1i1) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); + if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); @@ -2502,7 +2496,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } - return DAG.getBitcast(ValVT, ValReturned); } @@ -2809,8 +2802,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) - : Val; + return ExtendedInMem + ? (VA.getValVT().isVector() + ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) + : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) + : Val; } // FIXME: Get this from tablegen. @@ -2960,7 +2956,7 @@ SDValue X86TargetLowering::LowerFormalArguments( RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; - else if (RegVT == MVT::i1) + else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; @@ -6871,7 +6867,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (!In.isUndef()) - Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); @@ -6914,7 +6910,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa<ConstantSDNode>(In)) NonConstIdx.push_back(idx); else { - Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) @@ -13946,7 +13942,6 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); - assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); @@ -13980,8 +13975,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); - return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec, + 
DAG.getIntPtrConstant(0, dl)); } SDValue @@ -13992,7 +13987,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); - if (Op.getSimpleValueType() == MVT::i1) + if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG); if (!isa<ConstantSDNode>(Idx)) { @@ -14163,10 +14158,13 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { return EltInVec; } - // Insertion of one bit into first or last position - // can be done with two SHIFTs + OR. + // Insertion of one bit into first position if (IdxVal == 0 ) { - // EltInVec already at correct index and other bits are 0. + // Clean top bits of vector. + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Clean the first bit in source vector. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); @@ -14175,6 +14173,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } + // Insertion of one bit into last position if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, @@ -17322,8 +17321,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) - && "SetCC type must be 8-bit or 1-bit integer"); + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); @@ -17457,7 +17455,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SSECC != 8) { if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0, + SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); @@ -17505,9 +17503,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } // AVX512 fallback is to lower selects of scalar floats to masked moves. - if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) && - Subtarget.hasAVX512()) - return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2); + if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { + SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); + return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); + } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; @@ -19048,8 +19047,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is coming as MVT::i8 and it should be truncated -/// to MVT::i1 while lowering masking intrinsics. +/// The mask is coming as MVT::i8 and it should be transformed +/// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. 
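The comment above describes getScalarMaskingNode: the mask operand arrives as an MVT::i8 value and, with this patch, is wrapped into an MVT::v1i1 node (rather than truncated to i1) before the X86select is built. For context, this is the kind of source-level pattern that produces such an i8 mask feeding a scalar select; the intrinsic is standard AVX-512F, while the surrounding function is purely illustrative and not part of the patch:

#include <immintrin.h>

// Compile with -mavx512f. Lane 0 of the result is a[0] + b[0] when bit 0 of
// the 8-bit mask k is set, otherwise src[0]; the upper lanes come from a.
// The backend receives k as an i8 and now models the selecting bit as a
// v1i1 value living in a VK1 register.
__m128 masked_scalar_add(__m128 src, __m128 a, __m128 b, __mmask8 k) {
  return _mm_mask_add_ss(src, k, a, b);
}
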
@@ -19064,11 +19063,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND) + Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); @@ -19507,10 +19505,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); - SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, + DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { @@ -19570,18 +19569,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, + DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; @@ -19629,13 +19628,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue FCmp; if (isRoundModeCurDirection(Sae)) - FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8)); else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); - // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" - return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); + FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8), Sae); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, + DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), @@ -23385,8 +23384,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); - EVT EltVT = NVT.getVectorElementType(); - SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { @@ -23404,6 +23401,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); + EVT EltVT = 
InOp.getOperand(0).getValueType(); + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) @@ -24709,16 +24708,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, // xbegin sinkMBB // // mainMBB: - // eax = -1 + // s0 = -1 + // + // fallBB: + // eax = # XABORT_DEF + // s1 = eax // // sinkMBB: - // v = eax + // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); + MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. @@ -24726,25 +24731,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned fallDstReg = MRI.createVirtualRegister(RC); + // thisMBB: - // xbegin sinkMBB + // xbegin fallMBB // # fallthrough to mainMBB - // # abortion to sinkMBB - BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + // # abortion to fallMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); - thisMBB->addSuccessor(sinkMBB); + thisMBB->addSuccessor(fallMBB); // mainMBB: - // EAX = -1 - BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + // mainDstReg := -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); + BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); - // sinkMBB: - // EAX is live into the sinkMBB - sinkMBB->addLiveIn(X86::EAX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), - MI.getOperand(0).getReg()) + // fallMBB: + // ; pseudo instruction to model hardware's definition from XABORT + // EAX := XABORT_DEF + // fallDstReg := EAX + BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); + BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); + fallMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; @@ -29574,7 +29594,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getAllOnesConstant(DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); @@ -31321,13 +31341,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; if (Subtarget.hasAVX512()) { - SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00, - CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); - if (N->getValueType(0) != MVT::i1) - return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), - FSetCC); - return FSetCC; + SDValue FSetCC = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0), + FSetCC, DAG.getIntPtrConstant(0, DL)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 71d395244b4a..f9344413bbcf 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -31,8 +31,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); // The mask VT. - ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1", - "v" # NumElts # "i1")); + ValueType KVT = !cast<ValueType>("v" # NumElts # "i1"); // Suffix used in the instruction mnemonic. string Suffix = suffix; @@ -2263,7 +2262,7 @@ let Predicates = [HasAVX512, NoDQI] in { let Predicates = [HasAVX512] in { def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - def : Pat<(i1 (load addr:$src)), + def : Pat<(v1i1 (load addr:$src)), (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>; def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), (KMOVWkm addr:$src)>; @@ -2280,77 +2279,45 @@ let Predicates = [HasBWI] in { } let Predicates = [HasAVX512] in { - def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), - (i32 1)), VK1)>; + multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> { + def : Pat<(maskVT (scalar_to_vector GR32:$src)), + (COPY_TO_REGCLASS GR32:$src, maskRC)>; - def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>; + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + (COPY_TO_REGCLASS maskRC:$src, GR32)>; - def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), - (COPY_TO_REGCLASS GR32:$src, VK1)>; + def : Pat<(maskVT (scalar_to_vector GR8:$src)), + (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i1 (trunc (i8 GR8:$src))), - (COPY_TO_REGCLASS - (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit), (i32 1)), VK1)>; + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i1 (trunc (i16 GR16:$src))), - (COPY_TO_REGCLASS - (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit), (i32 1)), VK1)>; - - def : Pat<(i32 (zext VK1:$src)), - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>; - - def : Pat<(i32 (anyext VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, GR32)>; - - def : Pat<(i8 (zext VK1:$src)), - (EXTRACT_SUBREG - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>; - - def : Pat<(i8 (anyext VK1:$src)), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>; + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + (COPY_TO_REGCLASS maskRC:$src, GR32)>; + } - def : Pat<(i64 (zext VK1:$src)), - (SUBREG_TO_REG (i64 0), - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>; + defm : operation_gpr_mask_copy_lowering<VK1, 
v1i1>; + defm : operation_gpr_mask_copy_lowering<VK2, v2i1>; + defm : operation_gpr_mask_copy_lowering<VK4, v4i1>; + defm : operation_gpr_mask_copy_lowering<VK8, v8i1>; + defm : operation_gpr_mask_copy_lowering<VK16, v16i1>; + defm : operation_gpr_mask_copy_lowering<VK32, v32i1>; + defm : operation_gpr_mask_copy_lowering<VK64, v64i1>; - def : Pat<(i64 (anyext VK1:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>; + def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), VK1)>; + def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), VK16)>; + def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), VK8)>; - def : Pat<(i16 (zext VK1:$src)), - (EXTRACT_SUBREG - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>; - - def : Pat<(i16 (anyext VK1:$src)), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>; -} -def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; -def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -def : Pat<(v4i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK4)>; -def : Pat<(v2i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK2)>; -def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; -def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; - -def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; -def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; -def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; - -def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>; +} // Mask unary operation // - KNOT @@ -2551,14 +2518,11 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>; def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>; + def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>; def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; - let AddedComplexity = 10 in { // To optimize isel table. 
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; - def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - } + def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 @@ -2570,6 +2534,12 @@ multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subV def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), (VT (COPY_TO_REGCLASS subRC:$src, RC))>; } +defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>; defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>; defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>; @@ -3249,7 +3219,7 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode, def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT _.FRC:$src2))))))), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk) @@ -3260,7 +3230,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT ZeroFP))))))), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz) @@ -3279,7 +3249,7 @@ def : Pat<(masked_store addr:$dst, Mask, (iPTR 0))), (iPTR 0)))), (!cast<Instruction>(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3296,7 +3266,7 @@ def : Pat<(masked_store addr:$dst, Mask, (iPTR 0))), (iPTR 0)))), (!cast<Instruction>(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3310,7 +3280,7 @@ def : Pat<(_.info128.VT (extract_subvector (v16i32 immAllZerosV))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3322,7 +3292,7 @@ def : Pat<(_.info128.VT (extract_subvector (iPTR 0))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; } @@ -3338,7 +3308,7 @@ def : Pat<(_.info128.VT (extract_subvector (v16i32 immAllZerosV))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3350,7 +3320,7 @@ def : Pat<(_.info128.VT (extract_subvector (iPTR 0))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, - (i1 
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; } @@ -3381,7 +3351,7 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), - (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)), + (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 9867ba84bb9b..e2e228f5544b 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -274,7 +274,7 @@ def X86select : SDNode<"X86ISD::SELECT", SDTCisSameNumEltsAs<0, 1>]>>; def X86selects : SDNode<"X86ISD::SELECTS", - SDTypeProfile<1, 3, [SDTCisVT<1, i1>, + SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>>; @@ -441,7 +441,7 @@ def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", SDTCisSameNumEltsAs<0,1>, SDTCisVT<2, i32>]>, []>; def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", - SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", @@ -451,7 +451,7 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 092ceb207ada..f7083a7448ce 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -10511,9 +10511,7 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF, - bool IsTailCall) const { - return; -} + bool IsTailCall) const {} MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 4d7d8ece92d9..01df07e1715f 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -896,9 +896,16 @@ def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; -def OptForSize : Predicate<"Subtarget->getOptForSize()">; -def OptForMinSize : Predicate<"Subtarget->getOptForMinSize()">; -def OptForSpeed : Predicate<"!Subtarget->getOptForSize()">; + +// We could compute these on a per-module basis but doing so requires accessing +// the Function object through the <Target>Subtarget and objections were raised +// to that (see post-commit review comments for r301750). 
+let RecomputePerFunction = 1 in { + def OptForSize : Predicate<"MF->getFunction()->optForSize()">; + def OptForMinSize : Predicate<"MF->getFunction()->optForMinSize()">; + def OptForSpeed : Predicate<"!MF->getFunction()->optForSize()">; +} + def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index 38ac8be94483..61aac58a491f 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -30,6 +30,11 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), "xbegin\t$dst", []>, OpSize32; } +// Psuedo instruction to fake the definition of EAX on the fallback code path. +let isPseudo = 1, Defs = [EAX] in { +def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>; +} + def XEND : I<0x01, MRM_D5, (outs), (ins), "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 5eb5ad52840a..61956f741820 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -449,24 +449,30 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, if (!SrcRC) return false; - if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || - !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); - return false; - } - + unsigned SubIdx; if (DstRC == SrcRC) { // Nothing to be done + SubIdx = X86::NoSubRegister; } else if (DstRC == &X86::GR32RegClass) { - I.getOperand(1).setSubReg(X86::sub_32bit); + SubIdx = X86::sub_32bit; } else if (DstRC == &X86::GR16RegClass) { - I.getOperand(1).setSubReg(X86::sub_16bit); + SubIdx = X86::sub_16bit; } else if (DstRC == &X86::GR8RegClass) { - I.getOperand(1).setSubReg(X86::sub_8bit); + SubIdx = X86::sub_8bit; } else { return false; } + SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx); + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + return false; + } + + I.getOperand(1).setSubReg(SubIdx); + I.setDesc(TII.get(X86::COPY)); return true; } diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 8ce240714f17..da724f5d8989 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -184,6 +184,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() { return; const LLT s64 = LLT::scalar(64); + const LLT v16s8 = LLT::vector(16, 8); const LLT v8s16 = LLT::vector(8, 16); const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); @@ -193,7 +194,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() { setAction({BinOp, Ty}, Legal); for (unsigned BinOp : {G_ADD, G_SUB}) - for (auto Ty : {v4s32}) + for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) setAction({BinOp, Ty}, Legal); setAction({G_MUL, v8s16}, Legal); @@ -212,8 +213,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() { if (!Subtarget.hasAVX2()) return; + const LLT v32s8 = LLT::vector(32, 8); const LLT v16s16 = LLT::vector(16, 16); const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + + for (unsigned BinOp : {G_ADD, G_SUB}) + for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) + setAction({BinOp, Ty}, Legal); for (auto Ty : {v16s16, v8s32}) setAction({G_MUL, Ty}, Legal); @@ 
-224,6 +231,11 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { return; const LLT v16s32 = LLT::vector(16, 32); + const LLT v8s64 = LLT::vector(8, 64); + + for (unsigned BinOp : {G_ADD, G_SUB}) + for (auto Ty : {v16s32, v8s64}) + setAction({BinOp, Ty}, Legal); setAction({G_MUL, v16s32}, Legal); @@ -261,8 +273,13 @@ void X86LegalizerInfo::setLegalizerInfoAVX512BW() { if (!(Subtarget.hasAVX512() && Subtarget.hasBWI())) return; + const LLT v64s8 = LLT::vector(64, 8); const LLT v32s16 = LLT::vector(32, 16); + for (unsigned BinOp : {G_ADD, G_SUB}) + for (auto Ty : {v64s8, v32s16}) + setAction({BinOp, Ty}, Legal); + setAction({G_MUL, v32s16}, Legal); /************ VLX *******************/ diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index d235d2b40b15..3a61a7247c72 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -511,7 +511,7 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} +def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} @@ -519,7 +519,7 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} -def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} +def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index d66d39dcee17..2b1f43bffd71 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -320,6 +320,7 @@ void X86Subtarget::initializeEnvironment() { CallRegIndirect = false; LEAUsesAG = false; SlowLEA = false; + Slow3OpsLEA = false; SlowIncDec = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? @@ -336,8 +337,7 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, - unsigned StackAlignOverride, bool OptForSize, - bool OptForMinSize) + unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride), @@ -347,8 +347,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering(*this, getStackAlignment()), OptForSize(OptForSize), - OptForMinSize(OptForMinSize) { + FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. 
if (!isPositionIndependent()) setPICStyle(PICStyles::None); diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index de1514243aeb..a9f3a2aee1be 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -253,6 +253,11 @@ protected: /// True if the LEA instruction with certain arguments is slow bool SlowLEA; + /// True if the LEA instruction has all three source operands: base, index, + /// and offset or if the LEA instruction uses base and index registers where + /// the base is EBP, RBP,or R13 + bool Slow3OpsLEA; + /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; @@ -331,16 +336,12 @@ private: X86TargetLowering TLInfo; X86FrameLowering FrameLowering; - bool OptForSize; - bool OptForMinSize; - public: /// This constructor initializes the data members to match that /// of the specified triple. /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const X86TargetMachine &TM, unsigned StackAlignOverride, - bool OptForSize, bool OptForMinSize); + const X86TargetMachine &TM, unsigned StackAlignOverride); /// This object will take onwership of \p GISelAccessor. void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); } @@ -490,6 +491,7 @@ public: bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } + bool slow3OpsLEA() const { return Slow3OpsLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } @@ -507,9 +509,6 @@ public: bool isSLM() const { return X86ProcFamily == IntelSLM; } bool useSoftFloat() const { return UseSoftFloat; } - bool getOptForSize() const { return OptForSize; } - bool getOptForMinSize() const { return OptForMinSize; } - /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for /// no-sse2). There isn't any reason to disable it if the target processor /// supports it. diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 9a82e6e50463..53a8e83b36fc 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); +void initializeFixupLEAPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); } // end namespace llvm @@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() { initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); + initializeFixupLEAPassPass(PR); initializeX86ExecutionDepsFixPass(PR); } @@ -268,12 +270,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { FS = Key.substr(CPU.size()); - bool OptForSize = F.optForSize(); - bool OptForMinSize = F.optForMinSize(); - - Key += std::string(OptForSize ? "+" : "-") + "optforsize"; - Key += std::string(OptForMinSize ? "+" : "-") + "optforminsize"; - auto &I = SubtargetMap[Key]; if (!I) { // This needs to be done before we create a new subtarget since any @@ -281,8 +277,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. 
resetTargetOptions(F); I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, - Options.StackAlignmentOverride, - OptForSize, OptForMinSize); + Options.StackAlignmentOverride); #ifndef LLVM_BUILD_GLOBAL_ISEL GISelAccessor *GISel = new GISelAccessor(); #else @@ -378,12 +373,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { } void X86PassConfig::addIRPasses() { - addPass(createAtomicExpandPass(&getX86TargetMachine())); + addPass(createAtomicExpandPass()); TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createInterleavedAccessPass(TM)); + addPass(createInterleavedAccessPass()); } bool X86PassConfig::addInstSelector() { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 8566bd91c89e..fe94079fd869 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1392,15 +1392,47 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll + static const CostTblEntry AVX512CDCostTbl[] = { + { ISD::CTLZ, MVT::v8i64, 1 }, + { ISD::CTLZ, MVT::v16i32, 1 }, + { ISD::CTLZ, MVT::v32i16, 8 }, + { ISD::CTLZ, MVT::v64i8, 20 }, + { ISD::CTLZ, MVT::v4i64, 1 }, + { ISD::CTLZ, MVT::v8i32, 1 }, + { ISD::CTLZ, MVT::v16i16, 4 }, + { ISD::CTLZ, MVT::v32i8, 10 }, + { ISD::CTLZ, MVT::v2i64, 1 }, + { ISD::CTLZ, MVT::v4i32, 1 }, + { ISD::CTLZ, MVT::v8i16, 4 }, + { ISD::CTLZ, MVT::v16i8, 4 }, + }; static const CostTblEntry AVX512BWCostTbl[] = { { ISD::BITREVERSE, MVT::v8i64, 5 }, { ISD::BITREVERSE, MVT::v16i32, 5 }, { ISD::BITREVERSE, MVT::v32i16, 5 }, { ISD::BITREVERSE, MVT::v64i8, 5 }, + { ISD::CTLZ, MVT::v8i64, 23 }, + { ISD::CTLZ, MVT::v16i32, 22 }, + { ISD::CTLZ, MVT::v32i16, 18 }, + { ISD::CTLZ, MVT::v64i8, 17 }, + { ISD::CTPOP, MVT::v8i64, 7 }, + { ISD::CTPOP, MVT::v16i32, 11 }, + { ISD::CTPOP, MVT::v32i16, 9 }, + { ISD::CTPOP, MVT::v64i8, 6 }, + { ISD::CTTZ, MVT::v8i64, 10 }, + { ISD::CTTZ, MVT::v16i32, 14 }, + { ISD::CTTZ, MVT::v32i16, 12 }, + { ISD::CTTZ, MVT::v64i8, 9 }, }; static const CostTblEntry AVX512CostTbl[] = { { ISD::BITREVERSE, MVT::v8i64, 36 }, { ISD::BITREVERSE, MVT::v16i32, 24 }, + { ISD::CTLZ, MVT::v8i64, 29 }, + { ISD::CTLZ, MVT::v16i32, 35 }, + { ISD::CTPOP, MVT::v8i64, 16 }, + { ISD::CTPOP, MVT::v16i32, 24 }, + { ISD::CTTZ, MVT::v8i64, 20 }, + { ISD::CTTZ, MVT::v16i32, 28 }, }; static const CostTblEntry XOPCostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 4 }, @@ -1560,6 +1592,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, MVT MTy = LT.second; // Attempt to lookup cost. + if (ST->hasCDI()) + if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) return LT.first * Entry->Cost; |
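
The X86FixupLEAs changes above (processInstrForSlow3OpLEA) split an LEA that either has all three source operands or uses EBP/RBP/R13 as a base together with an index register into cheaper instructions on targets with FeatureSlow3OpsLEA. A minimal sketch of the two main rewrite rules on plain data; the struct and helper below are invented for illustration and are not the pass's MachineInstr-based API:

#include <cstdio>
#include <string>
#include <vector>

// Simplified model of the rewrites: registers are plain strings and only the
// cases spelled out in the patch comments are handled.
struct LEA { std::string dst, base, index; int scale; long offset; };

std::vector<std::string> splitSlowLEA(const LEA &L) {
  std::vector<std::string> out;
  if (L.scale == 1 && (L.dst == L.base || L.dst == L.index)) {
    // lea off(%base,%index,1), %base  =>  add %index, %base ; add $off, %base
    const std::string &other = (L.dst == L.base) ? L.index : L.base;
    out.push_back("add " + other + ", " + L.dst);
    if (L.offset != 0)
      out.push_back("add $" + std::to_string(L.offset) + ", " + L.dst);
  } else {
    // lea off(%base,%index,scale), %dst
    //   =>  lea (%base,%index,scale), %dst ; add $off, %dst
    out.push_back("lea (" + L.base + "," + L.index + "," +
                  std::to_string(L.scale) + "), " + L.dst);
    if (L.offset != 0)
      out.push_back("add $" + std::to_string(L.offset) + ", " + L.dst);
  }
  return out;
}

int main() {
  // lea 16(%rbx,%rcx,1), %rbx becomes: add %rcx, %rbx ; add $16, %rbx
  for (const std::string &S : splitSlowLEA({"%rbx", "%rbx", "%rcx", 1, 16}))
    std::puts(S.c_str());
}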
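
The InsertBitToMaskVector hunk above handles insertion at index 0 by shifting the element-in-vector value left and then right by NumElems-1 to clear its upper bits, clearing bit 0 of the source with the opposite shift pair, and OR'ing the two. The same trick on a plain 8-bit integer, as a hypothetical stand-in for a v8i1 mask register (not the DAG code itself):

#include <cassert>
#include <cstdint>

// Insert the low bit of EltInVec at position 0 of an 8-element mask,
// mirroring the KSHIFTL/KSHIFTR + OR sequence for IdxVal == 0.
uint8_t insertBit0(uint8_t Mask, uint8_t EltInVec) {
  const unsigned NumElems = 8;
  // Clean the top bits of the inserted element: shift left, then right.
  EltInVec = (uint8_t)(EltInVec << (NumElems - 1));
  EltInVec = (uint8_t)(EltInVec >> (NumElems - 1));
  // Clean the first bit of the source vector.
  Mask = (uint8_t)(Mask >> 1);
  Mask = (uint8_t)(Mask << 1);
  return Mask | EltInVec;
}

int main() {
  assert(insertBit0(0xB6 /*10110110*/, 0x03 /*00000011*/) == 0xB7 /*10110111*/);
  return 0;
}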
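
The emitXBegin change above gives XBEGIN a real diamond: the fall-through block materializes -1 into a virtual register, the new fallback block uses the XABORT_DEF pseudo to model the hardware writing an abort status into EAX and copies it out, and a PHI in the sink block merges the two. At the source level this corresponds to the usual RTM pattern; the intrinsics below are the standard <immintrin.h> ones (compile with -mrtm), while the function itself is only an illustration:

#include <immintrin.h>

int Counter;

void increment_transactionally() {
  unsigned Status = _xbegin();
  if (Status == _XBEGIN_STARTED) {   // -1: the "s0 = -1" path in the comment
    ++Counter;                       // transactional body
    _xend();
  } else {
    // Aborted: the CPU rewrote EAX with an abort code (now modeled by the
    // XABORT_DEF pseudo); real code would take a lock or retry here.
  }
}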