From 145449b1e420787bb99721a429341fa6be3adfb6 Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Sun, 3 Jul 2022 16:10:23 +0200
Subject: Vendor import of llvm-project main llvmorg-15-init-15358-g53dc0f107877.

---
 llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 513 ++--
 llvm/lib/Target/X86/AsmParser/X86Operand.h | 36 +-
 .../Target/X86/Disassembler/X86Disassembler.cpp | 77 +-
 llvm/lib/Target/X86/MCA/X86CustomBehaviour.h | 2 +-
 .../Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 8 +-
 llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 105 +-
 .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 22 +-
 .../Target/X86/MCTargetDesc/X86InstPrinterCommon.h | 3 +-
 .../X86/MCTargetDesc/X86InstrRelaxTables.cpp | 165 +
 .../Target/X86/MCTargetDesc/X86InstrRelaxTables.h | 54 +
 .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 4 +-
 .../Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 134 +-
 llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h | 1 +
 .../Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 91 +
 llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 26 +-
 .../Target/X86/MCTargetDesc/X86MnemonicTables.cpp | 16 +
 .../Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 21 +-
 .../X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 1 +
 llvm/lib/Target/X86/X86.h | 4 +
 llvm/lib/Target/X86/X86.td | 279 +-
 llvm/lib/Target/X86/X86AsmPrinter.cpp | 96 +-
 llvm/lib/Target/X86/X86AsmPrinter.h | 5 +-
 llvm/lib/Target/X86/X86AvoidTrailingCall.cpp | 7 +-
 llvm/lib/Target/X86/X86CallingConv.cpp | 2 +-
 llvm/lib/Target/X86/X86CmovConversion.cpp | 27 +-
 llvm/lib/Target/X86/X86DiscriminateMemOps.cpp | 3 +-
 llvm/lib/Target/X86/X86DomainReassignment.cpp | 14 +-
 llvm/lib/Target/X86/X86ExpandPseudo.cpp | 11 +-
 llvm/lib/Target/X86/X86FastISel.cpp | 133 +-
 llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 709 +++++
 llvm/lib/Target/X86/X86FastTileConfig.cpp | 293 +-
 llvm/lib/Target/X86/X86FixupLEAs.cpp | 3 +-
 llvm/lib/Target/X86/X86FloatingPoint.cpp | 26 +-
 llvm/lib/Target/X86/X86FrameLowering.cpp | 136 +-
 llvm/lib/Target/X86/X86FrameLowering.h | 7 +-
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 282 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp | 3225 +++++++++++++-------
 llvm/lib/Target/X86/X86ISelLowering.h | 58 +-
 llvm/lib/Target/X86/X86IndirectThunks.cpp | 1 +
 llvm/lib/Target/X86/X86InsertPrefetch.cpp | 1 +
 llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp | 49 +-
 llvm/lib/Target/X86/X86InstrAMX.td | 18 +-
 llvm/lib/Target/X86/X86InstrAVX512.td | 131 +-
 llvm/lib/Target/X86/X86InstrArithmetic.td | 8 +-
 llvm/lib/Target/X86/X86InstrCMovSetCC.td | 8 +-
 llvm/lib/Target/X86/X86InstrCompiler.td | 85 +-
 llvm/lib/Target/X86/X86InstrControl.td | 4 +-
 llvm/lib/Target/X86/X86InstrFPStack.td | 22 +-
 llvm/lib/Target/X86/X86InstrFoldTables.cpp | 4 +-
 llvm/lib/Target/X86/X86InstrFormats.td | 6 +-
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 1 -
 llvm/lib/Target/X86/X86InstrInfo.cpp | 851 ++++--
 llvm/lib/Target/X86/X86InstrInfo.h | 18 +-
 llvm/lib/Target/X86/X86InstrInfo.td | 111 +-
 llvm/lib/Target/X86/X86InstrMMX.td | 4 +-
 llvm/lib/Target/X86/X86InstrSSE.td | 68 +-
 llvm/lib/Target/X86/X86InstrSystem.td | 16 +-
 llvm/lib/Target/X86/X86InstrTSX.td | 2 +
 llvm/lib/Target/X86/X86InstrVecCompiler.td | 6 +-
 llvm/lib/Target/X86/X86InstrXOP.td | 4 +-
 llvm/lib/Target/X86/X86InstructionSelector.cpp | 16 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h | 12 +-
 .../X86/X86LoadValueInjectionLoadHardening.cpp | 3 +-
 llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 1 +
 llvm/lib/Target/X86/X86LowerAMXType.cpp | 181 +-
 llvm/lib/Target/X86/X86MCInstLower.cpp | 41 +-
 llvm/lib/Target/X86/X86MachineFunctionInfo.cpp | 7 +
 llvm/lib/Target/X86/X86MachineFunctionInfo.h | 10 +-
 llvm/lib/Target/X86/X86MacroFusion.cpp | 1 +
 llvm/lib/Target/X86/X86PadShortFunction.cpp | 11 +-
 llvm/lib/Target/X86/X86PartialReduction.cpp | 35 +-
 llvm/lib/Target/X86/X86PreAMXConfig.cpp | 56 +-
 llvm/lib/Target/X86/X86PreTileConfig.cpp | 53 +-
 llvm/lib/Target/X86/X86RegisterBankInfo.cpp | 7 +-
 llvm/lib/Target/X86/X86RegisterBankInfo.h | 2 +-
 llvm/lib/Target/X86/X86RegisterInfo.cpp | 62 +
 llvm/lib/Target/X86/X86RegisterInfo.h | 12 +
 llvm/lib/Target/X86/X86RegisterInfo.td | 15 +-
 llvm/lib/Target/X86/X86SchedBroadwell.td | 20 +-
 llvm/lib/Target/X86/X86SchedHaswell.td | 20 +-
 llvm/lib/Target/X86/X86SchedIceLake.td | 20 +-
 llvm/lib/Target/X86/X86SchedSandyBridge.td | 40 +-
 llvm/lib/Target/X86/X86SchedSkylakeClient.td | 26 +-
 llvm/lib/Target/X86/X86SchedSkylakeServer.td | 32 +-
 llvm/lib/Target/X86/X86ScheduleBtVer2.td | 4 +-
 llvm/lib/Target/X86/X86ScheduleSLM.td | 6 +-
 llvm/lib/Target/X86/X86ScheduleZnver1.td | 106 +-
 llvm/lib/Target/X86/X86ScheduleZnver2.td | 86 +-
 llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 39 +-
 llvm/lib/Target/X86/X86SelectionDAGInfo.h | 2 +-
 .../lib/Target/X86/X86SpeculativeLoadHardening.cpp | 31 +-
 llvm/lib/Target/X86/X86Subtarget.cpp | 12 +-
 llvm/lib/Target/X86/X86Subtarget.h | 629 +---
 llvm/lib/Target/X86/X86TargetMachine.cpp | 51 +-
 llvm/lib/Target/X86/X86TargetMachine.h | 2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 290 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h | 21 +-
 llvm/lib/Target/X86/X86TileConfig.cpp | 15 +-
 98 files changed, 6078 insertions(+), 3916 deletions(-)
 create mode 100644 llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp
 create mode 100644 llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h
 create mode 100644 llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp
 create mode 100644 llvm/lib/Target/X86/X86FastPreTileConfig.cpp

(limited to 'llvm/lib/Target/X86')

diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e9ecff3bf514..871b23f80efe 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,6 +9,7 @@
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86IntelInstPrinter.h"
 #include "MCTargetDesc/X86MCExpr.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
 #include "MCTargetDesc/X86TargetStreamer.h"
 #include "TargetInfo/X86TargetInfo.h"
 #include "X86AsmParserCommon.h"
@@ -124,12 +125,12 @@ private:
                             bool matchingInlineAsm, unsigned VariantID = 0) {
     // In Code16GCC mode, match as 32-bit.
if (Code16GCC) - SwitchMode(X86::Mode32Bit); + SwitchMode(X86::Is32Bit); unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures, matchingInlineAsm, VariantID); if (Code16GCC) - SwitchMode(X86::Mode16Bit); + SwitchMode(X86::Is16Bit); return rv; } @@ -422,16 +423,18 @@ private: }; class IntelExprStateMachine { - IntelExprState State, PrevState; - unsigned BaseReg, IndexReg, TmpReg, Scale; - int64_t Imm; - const MCExpr *Sym; + IntelExprState State = IES_INIT, PrevState = IES_ERROR; + unsigned BaseReg = 0, IndexReg = 0, TmpReg = 0, Scale = 0; + int64_t Imm = 0; + const MCExpr *Sym = nullptr; StringRef SymName; InfixCalculator IC; InlineAsmIdentifierInfo Info; - short BracCount; - bool MemExpr; - bool OffsetOperator; + short BracCount = 0; + bool MemExpr = false; + bool OffsetOperator = false; + bool AttachToOperandIdx = false; + bool IsPIC = false; SMLoc OffsetOperatorLoc; AsmTypeInfo CurType; @@ -446,10 +449,7 @@ private: } public: - IntelExprStateMachine() - : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), - TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0), - MemExpr(false), OffsetOperator(false) {} + IntelExprStateMachine() = default; void addImm(int64_t imm) { Imm += imm; } short getBracCount() const { return BracCount; } @@ -469,9 +469,29 @@ private: bool isValidEndState() const { return State == IES_RBRAC || State == IES_INTEGER; } + + // Is the intel expression appended after an operand index. + // [OperandIdx][Intel Expression] + // This is neccessary for checking if it is an independent + // intel expression at back end when parse inline asm. + void setAppendAfterOperand() { AttachToOperandIdx = true; } + + bool isPIC() const { return IsPIC; } + void setPIC() { IsPIC = true; } + bool hadError() const { return State == IES_ERROR; } const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; } + bool regsUseUpError(StringRef &ErrMsg) { + // This case mostly happen in inline asm, e.g. Arr[BaseReg + IndexReg] + // can not intruduce additional register in inline asm in PIC model. + if (IsPIC && AttachToOperandIdx) + ErrMsg = "Don't use 2 or more regs for mem offset in PIC model!"; + else + ErrMsg = "BaseReg/IndexReg already set!"; + return true; + } + void onOr() { IntelExprState CurrState = State; switch (State) { @@ -655,10 +675,8 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = 0; } @@ -716,10 +734,8 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = 0; } @@ -777,10 +793,8 @@ private: case IES_MULTIPLY: // Index Register - Scale * Register if (PrevState == IES_INTEGER) { - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); State = IES_REGISTER; IndexReg = Reg; // Get the scale and replace the 'Scale * Register' with '0'. 
@@ -861,10 +875,8 @@ private: State = IES_INTEGER; if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { // Index Register - Register * Scale - if (IndexReg) { - ErrMsg = "BaseReg/IndexReg already set!"; - return true; - } + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = TmpInt; if (checkScale(Scale, ErrMsg)) @@ -945,7 +957,7 @@ private: BracCount++; return false; } - bool onRBrac() { + bool onRBrac(StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { default: @@ -955,8 +967,10 @@ private: case IES_OFFSET: case IES_REGISTER: case IES_RPAREN: - if (BracCount-- != 1) + if (BracCount-- != 1) { + ErrMsg = "unexpected bracket encountered"; return true; + } State = IES_RBRAC; if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { // If we already have a BaseReg, then assume this is the IndexReg with @@ -964,7 +978,8 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); + if (IndexReg) + return regsUseUpError(ErrMsg); IndexReg = TmpReg; Scale = 0; } @@ -1089,9 +1104,9 @@ private: std::unique_ptr &&Dst); bool VerifyAndAdjustOperands(OperandVector &OrigOperands, OperandVector &FinalOperands); - bool ParseOperand(OperandVector &Operands); - bool ParseATTOperand(OperandVector &Operands); - bool ParseIntelOperand(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, StringRef Name); + bool parseATTOperand(OperandVector &Operands); + bool parseIntelOperand(OperandVector &Operands, StringRef Name); bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, InlineAsmIdentifierInfo &Info, SMLoc &End); bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); @@ -1111,6 +1126,8 @@ private: InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator = false); + void tryParseOperandIdx(AsmToken::TokenKind PrevTK, + IntelExprStateMachine &SM); bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, OperandVector &Operands); @@ -1193,19 +1210,19 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? - return getSTI().getFeatureBits()[X86::Mode64Bit]; + return getSTI().getFeatureBits()[X86::Is64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this? - return getSTI().getFeatureBits()[X86::Mode32Bit]; + return getSTI().getFeatureBits()[X86::Is32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? 
- return getSTI().getFeatureBits()[X86::Mode16Bit]; + return getSTI().getFeatureBits()[X86::Is16Bit]; } void SwitchMode(unsigned mode) { MCSubtargetInfo &STI = copySTI(); - FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); + FeatureBitset AllModes({X86::Is64Bit, X86::Is32Bit, X86::Is16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; FeatureBitset FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); @@ -1716,11 +1733,11 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, return false; } -bool X86AsmParser::ParseOperand(OperandVector &Operands) { +bool X86AsmParser::parseOperand(OperandVector &Operands, StringRef Name) { if (isParsingIntelSyntax()) - return ParseIntelOperand(Operands); + return parseIntelOperand(Operands, Name); - return ParseATTOperand(Operands); + return parseATTOperand(Operands); } bool X86AsmParser::CreateMemForMSInlineAsm( @@ -1759,8 +1776,8 @@ bool X86AsmParser::CreateMemForMSInlineAsm( // registers in a mmory expression, and though unaccessible via rip/eip. if (IsGlobalLV && (BaseReg || IndexReg)) { Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, - End, Size, Identifier, Decl, - FrontendSize)); + End, Size, Identifier, Decl, 0, + BaseReg && IndexReg)); return false; } // Otherwise, we set the base register to a non-zero value @@ -1841,11 +1858,25 @@ bool X86AsmParser::ParseMasmNamedOperator(StringRef Name, return true; } +// Check if current intel expression append after an operand. +// Like: [Operand][Intel Expression] +void X86AsmParser::tryParseOperandIdx(AsmToken::TokenKind PrevTK, + IntelExprStateMachine &SM) { + if (PrevTK != AsmToken::RBrac) + return; + + SM.setAppendAfterOperand(); +} + bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); StringRef ErrMsg; AsmToken::TokenKind PrevTK = AsmToken::Error; + + if (getContext().getObjectFileInfo()->isPositionIndependent()) + SM.setPIC(); + bool Done = false; while (!Done) { // Get a fresh reference on each loop iteration in case the previous @@ -2123,10 +2154,12 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::LBrac: if (SM.onLBrac()) return Error(Tok.getLoc(), "unexpected bracket encountered"); + tryParseOperandIdx(PrevTK, SM); break; case AsmToken::RBrac: - if (SM.onRBrac()) - return Error(Tok.getLoc(), "unexpected bracket encountered"); + if (SM.onRBrac(ErrMsg)) { + return Error(Tok.getLoc(), ErrMsg); + } break; case AsmToken::LParen: SM.onLParen(); break; case AsmToken::RParen: SM.onRParen(); break; @@ -2477,7 +2510,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { return false; } -bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { +bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -2552,6 +2585,8 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { StringRef ErrMsg; unsigned BaseReg = SM.getBaseReg(); unsigned IndexReg = SM.getIndexReg(); + if (IndexReg && BaseReg == X86::RIP) + BaseReg = 0; unsigned Scale = SM.getScale(); if (!PtrInOperand) Size = SM.getElementSize() << 3; @@ -2597,25 +2632,49 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { // When parsing x64 MS-style assembly, all non-absolute references to a named // variable default to RIP-relative. 
- if (Parser.isParsingMasm() && is64BitMode() && SM.getElementSize() > 0) { - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, - End, Size, - /*DefaultBaseReg=*/X86::RIP)); - return false; + unsigned DefaultBaseReg = X86::NoRegister; + bool MaybeDirectBranchDest = true; + + if (Parser.isParsingMasm()) { + bool IsUnconditionalBranch = + Name.equals_insensitive("jmp") || Name.equals_insensitive("call"); + if (is64BitMode() && SM.getElementSize() > 0) { + DefaultBaseReg = X86::RIP; + } + if (IsUnconditionalBranch) { + if (PtrInOperand) { + MaybeDirectBranchDest = false; + if (is64BitMode()) + DefaultBaseReg = X86::RIP; + } else if (!BaseReg && !IndexReg && Disp && + Disp->getKind() == MCExpr::SymbolRef) { + if (is64BitMode()) { + if (SM.getSize() == 8) { + MaybeDirectBranchDest = false; + DefaultBaseReg = X86::RIP; + } + } else { + if (SM.getSize() == 4 || SM.getSize() == 2) + MaybeDirectBranchDest = false; + } + } + } } - if ((BaseReg || IndexReg || RegNo)) - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, - End, Size)); + if ((BaseReg || IndexReg || RegNo || DefaultBaseReg != X86::NoRegister)) + Operands.push_back(X86Operand::CreateMem( + getPointerWidth(), RegNo, Disp, BaseReg, IndexReg, Scale, Start, End, + Size, DefaultBaseReg, /*SymName=*/StringRef(), /*OpDecl=*/nullptr, + /*FrontendSize=*/0, /*UseUpRegs=*/false, MaybeDirectBranchDest)); else - Operands.push_back( - X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size)); + Operands.push_back(X86Operand::CreateMem( + getPointerWidth(), Disp, Start, End, Size, /*SymName=*/StringRef(), + /*OpDecl=*/nullptr, /*FrontendSize=*/0, /*UseUpRegs=*/false, + MaybeDirectBranchDest)); return false; } -bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { +bool X86AsmParser::parseATTOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); switch (getLexer().getKind()) { case AsmToken::Dollar: { @@ -2722,7 +2781,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr &Z, if (!getLexer().is(AsmToken::RCurly)) return Error(getLexer().getLoc(), "Expected } at this point"); Parser.Lex(); // Eat '}' - // Assign Z with the {z} mark opernad + // Assign Z with the {z} mark operand Z = X86Operand::CreateToken("{z}", StartLoc); return false; } @@ -3346,7 +3405,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name = Next; PatchedName = Name; - ForcedDataPrefix = X86::Mode32Bit; + ForcedDataPrefix = X86::Is32Bit; IsPrefix = false; } } @@ -3371,7 +3430,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Read the operands. 
while (true) { - if (ParseOperand(Operands)) + if (parseOperand(Operands, Name)) return true; if (HandleAVX512Operand(Operands)) return true; @@ -3774,84 +3833,27 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { } bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { + using namespace X86; const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - - switch (Inst.getOpcode()) { - case X86::VGATHERDPDYrm: - case X86::VGATHERDPDrm: - case X86::VGATHERDPSYrm: - case X86::VGATHERDPSrm: - case X86::VGATHERQPDYrm: - case X86::VGATHERQPDrm: - case X86::VGATHERQPSYrm: - case X86::VGATHERQPSrm: - case X86::VPGATHERDDYrm: - case X86::VPGATHERDDrm: - case X86::VPGATHERDQYrm: - case X86::VPGATHERDQrm: - case X86::VPGATHERQDYrm: - case X86::VPGATHERQDrm: - case X86::VPGATHERQQYrm: - case X86::VPGATHERQQrm: { - unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); - unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg()); - unsigned Index = - MRI->getEncodingValue(Inst.getOperand(3 + X86::AddrIndexReg).getReg()); - if (Dest == Mask || Dest == Index || Mask == Index) - return Warning(Ops[0]->getStartLoc(), "mask, index, and destination " - "registers should be distinct"); - break; - } - case X86::VGATHERDPDZ128rm: - case X86::VGATHERDPDZ256rm: - case X86::VGATHERDPDZrm: - case X86::VGATHERDPSZ128rm: - case X86::VGATHERDPSZ256rm: - case X86::VGATHERDPSZrm: - case X86::VGATHERQPDZ128rm: - case X86::VGATHERQPDZ256rm: - case X86::VGATHERQPDZrm: - case X86::VGATHERQPSZ128rm: - case X86::VGATHERQPSZ256rm: - case X86::VGATHERQPSZrm: - case X86::VPGATHERDDZ128rm: - case X86::VPGATHERDDZ256rm: - case X86::VPGATHERDDZrm: - case X86::VPGATHERDQZ128rm: - case X86::VPGATHERDQZ256rm: - case X86::VPGATHERDQZrm: - case X86::VPGATHERQDZ128rm: - case X86::VPGATHERQDZ256rm: - case X86::VPGATHERQDZrm: - case X86::VPGATHERQQZ128rm: - case X86::VPGATHERQQZ256rm: - case X86::VPGATHERQQZrm: { - unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); - unsigned Index = - MRI->getEncodingValue(Inst.getOperand(4 + X86::AddrIndexReg).getReg()); - if (Dest == Index) - return Warning(Ops[0]->getStartLoc(), "index and destination registers " - "should be distinct"); - break; - } - case X86::V4FMADDPSrm: - case X86::V4FMADDPSrmk: - case X86::V4FMADDPSrmkz: - case X86::V4FMADDSSrm: - case X86::V4FMADDSSrmk: - case X86::V4FMADDSSrmkz: - case X86::V4FNMADDPSrm: - case X86::V4FNMADDPSrmk: - case X86::V4FNMADDPSrmkz: - case X86::V4FNMADDSSrm: - case X86::V4FNMADDSSrmk: - case X86::V4FNMADDSSrmkz: - case X86::VP4DPWSSDSrm: - case X86::VP4DPWSSDSrmk: - case X86::VP4DPWSSDSrmkz: - case X86::VP4DPWSSDrm: - case X86::VP4DPWSSDrmk: - case X86::VP4DPWSSDrmkz: { + unsigned Opcode = Inst.getOpcode(); + uint64_t TSFlags = MII.get(Opcode).TSFlags; + if (isVFCMADDCPH(Opcode) || isVFCMADDCSH(Opcode) || isVFMADDCPH(Opcode) || + isVFMADDCSH(Opcode)) { + unsigned Dest = Inst.getOperand(0).getReg(); + for (unsigned i = 2; i < Inst.getNumOperands(); i++) + if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) + return Warning(Ops[0]->getStartLoc(), "Destination register should be " + "distinct from source registers"); + } else if (isVFCMULCPH(Opcode) || isVFCMULCSH(Opcode) || isVFMULCPH(Opcode) || + isVFMULCSH(Opcode)) { + unsigned Dest = Inst.getOperand(0).getReg(); + for (unsigned i = 1; i < Inst.getNumOperands(); i++) + if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) + return Warning(Ops[0]->getStartLoc(), 
"Destination register should be " + "distinct from source registers"); + } else if (isV4FMADDPS(Opcode) || isV4FMADDSS(Opcode) || + isV4FNMADDPS(Opcode) || isV4FNMADDSS(Opcode) || + isVP4DPWSSDS(Opcode) || isVP4DPWSSD(Opcode)) { unsigned Src2 = Inst.getOperand(Inst.getNumOperands() - X86::AddrNumOperands - 1).getReg(); unsigned Src2Enc = MRI->getEncodingValue(Src2); @@ -3865,186 +3867,34 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { RegName.take_front(3) + Twine(GroupEnd) + "' source group"); } - break; - } - case X86::VFCMADDCPHZ128m: - case X86::VFCMADDCPHZ256m: - case X86::VFCMADDCPHZm: - case X86::VFCMADDCPHZ128mb: - case X86::VFCMADDCPHZ256mb: - case X86::VFCMADDCPHZmb: - case X86::VFCMADDCPHZ128mbk: - case X86::VFCMADDCPHZ256mbk: - case X86::VFCMADDCPHZmbk: - case X86::VFCMADDCPHZ128mbkz: - case X86::VFCMADDCPHZ256mbkz: - case X86::VFCMADDCPHZmbkz: - case X86::VFCMADDCPHZ128mk: - case X86::VFCMADDCPHZ256mk: - case X86::VFCMADDCPHZmk: - case X86::VFCMADDCPHZ128mkz: - case X86::VFCMADDCPHZ256mkz: - case X86::VFCMADDCPHZmkz: - case X86::VFCMADDCPHZ128r: - case X86::VFCMADDCPHZ256r: - case X86::VFCMADDCPHZr: - case X86::VFCMADDCPHZ128rk: - case X86::VFCMADDCPHZ256rk: - case X86::VFCMADDCPHZrk: - case X86::VFCMADDCPHZ128rkz: - case X86::VFCMADDCPHZ256rkz: - case X86::VFCMADDCPHZrkz: - case X86::VFCMADDCPHZrb: - case X86::VFCMADDCPHZrbk: - case X86::VFCMADDCPHZrbkz: - case X86::VFCMADDCSHZm: - case X86::VFCMADDCSHZmk: - case X86::VFCMADDCSHZmkz: - case X86::VFCMADDCSHZr: - case X86::VFCMADDCSHZrb: - case X86::VFCMADDCSHZrbk: - case X86::VFCMADDCSHZrbkz: - case X86::VFCMADDCSHZrk: - case X86::VFCMADDCSHZrkz: - case X86::VFMADDCPHZ128m: - case X86::VFMADDCPHZ256m: - case X86::VFMADDCPHZm: - case X86::VFMADDCPHZ128mb: - case X86::VFMADDCPHZ256mb: - case X86::VFMADDCPHZmb: - case X86::VFMADDCPHZ128mbk: - case X86::VFMADDCPHZ256mbk: - case X86::VFMADDCPHZmbk: - case X86::VFMADDCPHZ128mbkz: - case X86::VFMADDCPHZ256mbkz: - case X86::VFMADDCPHZmbkz: - case X86::VFMADDCPHZ128mk: - case X86::VFMADDCPHZ256mk: - case X86::VFMADDCPHZmk: - case X86::VFMADDCPHZ128mkz: - case X86::VFMADDCPHZ256mkz: - case X86::VFMADDCPHZmkz: - case X86::VFMADDCPHZ128r: - case X86::VFMADDCPHZ256r: - case X86::VFMADDCPHZr: - case X86::VFMADDCPHZ128rk: - case X86::VFMADDCPHZ256rk: - case X86::VFMADDCPHZrk: - case X86::VFMADDCPHZ128rkz: - case X86::VFMADDCPHZ256rkz: - case X86::VFMADDCPHZrkz: - case X86::VFMADDCPHZrb: - case X86::VFMADDCPHZrbk: - case X86::VFMADDCPHZrbkz: - case X86::VFMADDCSHZm: - case X86::VFMADDCSHZmk: - case X86::VFMADDCSHZmkz: - case X86::VFMADDCSHZr: - case X86::VFMADDCSHZrb: - case X86::VFMADDCSHZrbk: - case X86::VFMADDCSHZrbkz: - case X86::VFMADDCSHZrk: - case X86::VFMADDCSHZrkz: { - unsigned Dest = Inst.getOperand(0).getReg(); - for (unsigned i = 2; i < Inst.getNumOperands(); i++) - if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) - return Warning(Ops[0]->getStartLoc(), "Destination register should be " - "distinct from source registers"); - break; - } - case X86::VFCMULCPHZ128rm: - case X86::VFCMULCPHZ256rm: - case X86::VFCMULCPHZrm: - case X86::VFCMULCPHZ128rmb: - case X86::VFCMULCPHZ256rmb: - case X86::VFCMULCPHZrmb: - case X86::VFCMULCPHZ128rmbk: - case X86::VFCMULCPHZ256rmbk: - case X86::VFCMULCPHZrmbk: - case X86::VFCMULCPHZ128rmbkz: - case X86::VFCMULCPHZ256rmbkz: - case X86::VFCMULCPHZrmbkz: - case X86::VFCMULCPHZ128rmk: - case X86::VFCMULCPHZ256rmk: - case X86::VFCMULCPHZrmk: - case X86::VFCMULCPHZ128rmkz: - case 
X86::VFCMULCPHZ256rmkz: - case X86::VFCMULCPHZrmkz: - case X86::VFCMULCPHZ128rr: - case X86::VFCMULCPHZ256rr: - case X86::VFCMULCPHZrr: - case X86::VFCMULCPHZ128rrk: - case X86::VFCMULCPHZ256rrk: - case X86::VFCMULCPHZrrk: - case X86::VFCMULCPHZ128rrkz: - case X86::VFCMULCPHZ256rrkz: - case X86::VFCMULCPHZrrkz: - case X86::VFCMULCPHZrrb: - case X86::VFCMULCPHZrrbk: - case X86::VFCMULCPHZrrbkz: - case X86::VFCMULCSHZrm: - case X86::VFCMULCSHZrmk: - case X86::VFCMULCSHZrmkz: - case X86::VFCMULCSHZrr: - case X86::VFCMULCSHZrrb: - case X86::VFCMULCSHZrrbk: - case X86::VFCMULCSHZrrbkz: - case X86::VFCMULCSHZrrk: - case X86::VFCMULCSHZrrkz: - case X86::VFMULCPHZ128rm: - case X86::VFMULCPHZ256rm: - case X86::VFMULCPHZrm: - case X86::VFMULCPHZ128rmb: - case X86::VFMULCPHZ256rmb: - case X86::VFMULCPHZrmb: - case X86::VFMULCPHZ128rmbk: - case X86::VFMULCPHZ256rmbk: - case X86::VFMULCPHZrmbk: - case X86::VFMULCPHZ128rmbkz: - case X86::VFMULCPHZ256rmbkz: - case X86::VFMULCPHZrmbkz: - case X86::VFMULCPHZ128rmk: - case X86::VFMULCPHZ256rmk: - case X86::VFMULCPHZrmk: - case X86::VFMULCPHZ128rmkz: - case X86::VFMULCPHZ256rmkz: - case X86::VFMULCPHZrmkz: - case X86::VFMULCPHZ128rr: - case X86::VFMULCPHZ256rr: - case X86::VFMULCPHZrr: - case X86::VFMULCPHZ128rrk: - case X86::VFMULCPHZ256rrk: - case X86::VFMULCPHZrrk: - case X86::VFMULCPHZ128rrkz: - case X86::VFMULCPHZ256rrkz: - case X86::VFMULCPHZrrkz: - case X86::VFMULCPHZrrb: - case X86::VFMULCPHZrrbk: - case X86::VFMULCPHZrrbkz: - case X86::VFMULCSHZrm: - case X86::VFMULCSHZrmk: - case X86::VFMULCSHZrmkz: - case X86::VFMULCSHZrr: - case X86::VFMULCSHZrrb: - case X86::VFMULCSHZrrbk: - case X86::VFMULCSHZrrbkz: - case X86::VFMULCSHZrrk: - case X86::VFMULCSHZrrkz: { - unsigned Dest = Inst.getOperand(0).getReg(); - for (unsigned i = 1; i < Inst.getNumOperands(); i++) - if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) - return Warning(Ops[0]->getStartLoc(), "Destination register should be " - "distinct from source registers"); - break; - } + } else if (isVGATHERDPD(Opcode) || isVGATHERDPS(Opcode) || + isVGATHERQPD(Opcode) || isVGATHERQPS(Opcode) || + isVPGATHERDD(Opcode) || isVPGATHERDQ(Opcode) || + isVPGATHERQD(Opcode) || isVPGATHERQQ(Opcode)) { + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; + if (HasEVEX) { + unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Index = MRI->getEncodingValue( + Inst.getOperand(4 + X86::AddrIndexReg).getReg()); + if (Dest == Index) + return Warning(Ops[0]->getStartLoc(), "index and destination registers " + "should be distinct"); + } else { + unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + unsigned Index = MRI->getEncodingValue( + Inst.getOperand(3 + X86::AddrIndexReg).getReg()); + if (Dest == Mask || Dest == Index || Mask == Index) + return Warning(Ops[0]->getStartLoc(), "mask, index, and destination " + "registers should be distinct"); + } } - const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. 
- if ((MCID.TSFlags & X86II::EncodingMask) == 0) { + if ((TSFlags & X86II::EncodingMask) == 0) { MCPhysReg HReg = X86::NoRegister; - bool UsesRex = MCID.TSFlags & X86II::REX_W; + bool UsesRex = TSFlags & X86II::REX_W; unsigned NumOps = Inst.getNumOperands(); for (unsigned i = 0; i != NumOps; ++i) { const MCOperand &MO = Inst.getOperand(i); @@ -4313,15 +4163,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. - if (ForcedDataPrefix == X86::Mode32Bit) - SwitchMode(X86::Mode32Bit); + if (ForcedDataPrefix == X86::Is32Bit) + SwitchMode(X86::Is32Bit); // First, try a direct match. FeatureBitset MissingFeatures; unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, isParsingIntelSyntax()); - if (ForcedDataPrefix == X86::Mode32Bit) { - SwitchMode(X86::Mode16Bit); + if (ForcedDataPrefix == X86::Is32Bit) { + SwitchMode(X86::Is16Bit); ForcedDataPrefix = 0; } switch (OriginalError) { @@ -4840,8 +4690,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) { if (getParser().parseAbsoluteExpression(Control)) return true; } - if (getParser().parseToken(AsmToken::EndOfStatement, - "unexpected token in '.nops' directive")) + if (getParser().parseEOL()) return true; if (NumBytes <= 0) { @@ -4863,7 +4712,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) { /// parseDirectiveEven /// ::= .even bool X86AsmParser::parseDirectiveEven(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return false; const MCSection *Section = getStreamer().getCurrentSectionOnly(); @@ -4871,7 +4720,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { getStreamer().initSections(false, getSTI()); Section = getStreamer().getCurrentSectionOnly(); } - if (Section->UseCodeAlign()) + if (Section->useCodeAlign()) getStreamer().emitCodeAlignment(2, &getSTI(), 0); else getStreamer().emitValueToAlignment(2, 0, 1, 0); @@ -4886,7 +4735,7 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { if (IDVal == ".code16") { Parser.Lex(); if (!is16BitMode()) { - SwitchMode(X86::Mode16Bit); + SwitchMode(X86::Is16Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } } else if (IDVal == ".code16gcc") { @@ -4894,19 +4743,19 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { Parser.Lex(); Code16GCC = true; if (!is16BitMode()) { - SwitchMode(X86::Mode16Bit); + SwitchMode(X86::Is16Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } } else if (IDVal == ".code32") { Parser.Lex(); if (!is32BitMode()) { - SwitchMode(X86::Mode32Bit); + SwitchMode(X86::Is32Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code32); } } else if (IDVal == ".code64") { Parser.Lex(); if (!is64BitMode()) { - SwitchMode(X86::Mode64Bit); + SwitchMode(X86::Is64Bit); getParser().getStreamer().emitAssemblerFlag(MCAF_Code64); } } else { @@ -5035,7 +4884,7 @@ bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFIPushReg(Reg, Loc); + getStreamer().emitWinCFIPushReg(Reg, Loc); return false; } @@ -5055,7 +4904,7 @@ bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFISetFrame(Reg, Off, Loc); + getStreamer().emitWinCFISetFrame(Reg, Off, Loc); return false; } @@ -5075,7 +4924,7 @@ bool 
X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFISaveReg(Reg, Off, Loc); + getStreamer().emitWinCFISaveReg(Reg, Off, Loc); return false; } @@ -5095,7 +4944,7 @@ bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc); + getStreamer().emitWinCFISaveXMM(Reg, Off, Loc); return false; } @@ -5116,7 +4965,7 @@ bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) { return TokError("unexpected token in directive"); getParser().Lex(); - getStreamer().EmitWinCFIPushFrame(Code, Loc); + getStreamer().emitWinCFIPushFrame(Code, Loc); return false; } diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 67b1244708a8..075b800f9e20 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -17,6 +17,8 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include @@ -35,6 +37,10 @@ struct X86Operand final : public MCParsedAsmOperand { void *OpDecl; bool AddressOf; + /// This used for inline asm which may specify base reg and index reg for + /// MemOp. e.g. ARR[eax + ecx*4], so no extra reg can be used for MemOp. + bool UseUpRegs = false; + struct TokOp { const char *Data; unsigned Length; @@ -66,6 +72,11 @@ struct X86Operand final : public MCParsedAsmOperand { /// If the memory operand is unsized and there are multiple instruction /// matches, prefer the one with this size. unsigned FrontendSize; + + /// If false, then this operand must be a memory operand for an indirect + /// branch instruction. Otherwise, this operand may belong to either a + /// direct or indirect branch instruction. + bool MaybeDirectBranchDest; }; union { @@ -203,6 +214,10 @@ struct X86Operand final : public MCParsedAsmOperand { assert(Kind == Memory && "Invalid access!"); return Mem.FrontendSize; } + bool isMaybeDirectBranchDest() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.MaybeDirectBranchDest; + } bool isToken() const override {return Kind == Token; } @@ -285,12 +300,6 @@ struct X86Operand final : public MCParsedAsmOperand { bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; } - bool isMemPlaceholder(const MCInstrDesc &Desc) const override { - // Only MS InlineAsm uses global variables with registers rather than - // rip/eip. 
- return isMem() && !Mem.DefaultBaseReg && Mem.FrontendSize; - } - bool needAddressOf() const override { return AddressOf; } bool isMem() const override { return Kind == Memory; } @@ -374,8 +383,9 @@ struct X86Operand final : public MCParsedAsmOperand { bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1; + !getMemIndexReg() && getMemScale() == 1 && isMaybeDirectBranchDest(); } + bool isAVX512RC() const{ return isImm(); } @@ -384,6 +394,8 @@ struct X86Operand final : public MCParsedAsmOperand { return isAbsMem() && Mem.ModeSize == 16; } + bool isMemUseUpRegs() const override { return UseUpRegs; } + bool isSrcIdx() const { return !getMemIndexReg() && getMemScale() == 1 && (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || @@ -669,7 +681,8 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = nullptr, unsigned FrontendSize = 0) { + void *OpDecl = nullptr, unsigned FrontendSize = 0, + bool UseUpRegs = false, bool MaybeDirectBranchDest = true) { auto Res = std::make_unique(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -680,6 +693,8 @@ struct X86Operand final : public MCParsedAsmOperand { Res->Mem.Size = Size; Res->Mem.ModeSize = ModeSize; Res->Mem.FrontendSize = FrontendSize; + Res->Mem.MaybeDirectBranchDest = MaybeDirectBranchDest; + Res->UseUpRegs = UseUpRegs; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; @@ -693,7 +708,8 @@ struct X86Operand final : public MCParsedAsmOperand { SMLoc EndLoc, unsigned Size = 0, unsigned DefaultBaseReg = X86::NoRegister, StringRef SymName = StringRef(), void *OpDecl = nullptr, - unsigned FrontendSize = 0) { + unsigned FrontendSize = 0, bool UseUpRegs = false, + bool MaybeDirectBranchDest = true) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) && @@ -712,6 +728,8 @@ struct X86Operand final : public MCParsedAsmOperand { Res->Mem.Size = Size; Res->Mem.ModeSize = ModeSize; Res->Mem.FrontendSize = FrontendSize; + Res->Mem.MaybeDirectBranchDest = MaybeDirectBranchDest; + Res->UseUpRegs = UseUpRegs; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 908eb6d1fab1..1da6bf86397e 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -493,16 +493,15 @@ static int readPrefixes(struct InternalInstruction *insn) { insn->displacementSize = (insn->hasAdSize ? 2 : 4); insn->immediateSize = (insn->hasOpSize ? 2 : 4); } else if (insn->mode == MODE_64BIT) { + insn->displacementSize = 4; if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { insn->registerSize = 8; insn->addressSize = (insn->hasAdSize ? 4 : 8); - insn->displacementSize = 4; insn->immediateSize = 4; insn->hasOpSize = false; } else { insn->registerSize = (insn->hasOpSize ? 2 : 4); insn->addressSize = (insn->hasAdSize ? 4 : 8); - insn->displacementSize = (insn->hasOpSize ? 2 : 4); insn->immediateSize = (insn->hasOpSize ? 
2 : 4); } } @@ -1722,13 +1721,13 @@ X86GenericDisassembler::X86GenericDisassembler( std::unique_ptr MII) : MCDisassembler(STI, Ctx), MII(std::move(MII)) { const FeatureBitset &FB = STI.getFeatureBits(); - if (FB[X86::Mode16Bit]) { + if (FB[X86::Is16Bit]) { fMode = MODE_16BIT; return; - } else if (FB[X86::Mode32Bit]) { + } else if (FB[X86::Is32Bit]) { fMode = MODE_32BIT; return; - } else if (FB[X86::Mode64Bit]) { + } else if (FB[X86::Is64Bit]) { fMode = MODE_64BIT; return; } @@ -1801,46 +1800,6 @@ static void translateRegister(MCInst &mcInst, Reg reg) { mcInst.addOperand(MCOperand::createReg(llvmRegnum)); } -/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the -/// immediate Value in the MCInst. -/// -/// @param Value - The immediate Value, has had any PC adjustment made by -/// the caller. -/// @param isBranch - If the instruction is a branch instruction -/// @param Address - The starting address of the instruction -/// @param Offset - The byte offset to this immediate in the instruction -/// @param Width - The byte width of this immediate in the instruction -/// -/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was -/// called then that function is called to get any symbolic information for the -/// immediate in the instruction using the Address, Offset and Width. If that -/// returns non-zero then the symbolic information it returns is used to create -/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo() -/// returns zero and isBranch is true then a symbol look up for immediate Value -/// is done and if a symbol is found an MCExpr is created with that, else -/// an MCExpr with the immediate Value is created. This function returns true -/// if it adds an operand to the MCInst and false otherwise. -static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, - uint64_t Address, uint64_t Offset, - uint64_t Width, MCInst &MI, - const MCDisassembler *Dis) { - return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, - Offset, Width); -} - -/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being -/// referenced by a load instruction with the base register that is the rip. -/// These can often be addresses in a literal pool. The Address of the -/// instruction and its immediate Value are used to determine the address -/// being referenced in the literal pool entry. The SymbolLookUp call back will -/// return a pointer to a literal 'C' string if the referenced address is an -/// address into a section with 'C' string literals. 
-static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, - const void *Decoder) { - const MCDisassembler *Dis = static_cast(Decoder); - Dis->tryAddingPcLoadReferenceComment(Value, Address); -} - static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { 0, // SEG_OVERRIDE_NONE X86::CS, @@ -1914,8 +1873,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, uint64_t pcrel = 0; if (type == TYPE_REL) { isBranch = true; - pcrel = insn.startLocation + - insn.immediateOffset + insn.immediateSize; + pcrel = insn.startLocation + insn.length; switch (operand.encoding) { default: break; @@ -1990,9 +1948,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, break; } - if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, - insn.immediateOffset, insn.immediateSize, - mcInst, Dis)) + if (!Dis->tryAddingSymbolicOperand( + mcInst, immediate + pcrel, insn.startLocation, isBranch, + insn.immediateOffset, insn.immediateSize, insn.length)) mcInst.addOperand(MCOperand::createImm(immediate)); if (type == TYPE_MOFFS) { @@ -2129,11 +2087,10 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, return true; } if (insn.mode == MODE_64BIT){ - pcrel = insn.startLocation + - insn.displacementOffset + insn.displacementSize; - tryAddingPcLoadReferenceComment(insn.startLocation + - insn.displacementOffset, - insn.displacement + pcrel, Dis); + pcrel = insn.startLocation + insn.length; + Dis->tryAddingPcLoadReferenceComment(insn.displacement + pcrel, + insn.startLocation + + insn.displacementOffset); // Section 2.2.1.6 baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP : X86::RIP); @@ -2193,9 +2150,13 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, mcInst.addOperand(baseReg); mcInst.addOperand(scaleAmount); mcInst.addOperand(indexReg); - if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false, - insn.startLocation, insn.displacementOffset, - insn.displacementSize, mcInst, Dis)) + + const uint8_t dispSize = + (insn.eaDisplacement == EA_DISP_NONE) ? 0 : insn.displacementSize; + + if (!Dis->tryAddingSymbolicOperand( + mcInst, insn.displacement + pcrel, insn.startLocation, false, + insn.displacementOffset, dispSize, insn.length)) mcInst.addOperand(displacement); mcInst.addOperand(segmentReg); return false; diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 24d26751f0a1..61e1b6b27a85 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -35,7 +35,7 @@ public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~X86InstrPostProcess() {} + ~X86InstrPostProcess() = default; void postProcessInstruction(std::unique_ptr &Inst, const MCInst &MCI) override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index baacf2f46183..6fd3db4515ec 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -46,7 +46,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (CommentStream) HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII); - printInstFlags(MI, OS); + printInstFlags(MI, OS, STI); // Output CALLpcrel32 as "callq" in 64-bit mode. // In Intel annotation it's always emitted as "call". 
@@ -55,7 +55,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, // InstrInfo.td as soon as Requires clause is supported properly // for InstAlias. if (MI->getOpcode() == X86::CALLpcrel32 && - (STI.getFeatureBits()[X86::Mode64Bit])) { + (STI.getFeatureBits()[X86::Is64Bit])) { OS << "\tcallq\t"; printPCRelImm(MI, Address, 0, OS); } @@ -65,8 +65,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, // 0x66 to be interpreted as "data16" by the asm printer. // Thus we add an adjustment here in order to print the "right" instruction. else if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { - OS << "\tdata32"; + STI.getFeatureBits()[X86::Is16Bit]) { + OS << "\tdata32"; } // Try to print any aliases first. else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3df48b466d07..2d92b8d5b574 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -8,6 +8,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86InstrRelaxTables.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" @@ -222,87 +223,7 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) { static unsigned getRelaxedOpcodeArith(const MCInst &Inst) { unsigned Op = Inst.getOpcode(); - switch (Op) { - default: - return Op; - - // IMUL - case X86::IMUL16rri8: return X86::IMUL16rri; - case X86::IMUL16rmi8: return X86::IMUL16rmi; - case X86::IMUL32rri8: return X86::IMUL32rri; - case X86::IMUL32rmi8: return X86::IMUL32rmi; - case X86::IMUL64rri8: return X86::IMUL64rri32; - case X86::IMUL64rmi8: return X86::IMUL64rmi32; - - // AND - case X86::AND16ri8: return X86::AND16ri; - case X86::AND16mi8: return X86::AND16mi; - case X86::AND32ri8: return X86::AND32ri; - case X86::AND32mi8: return X86::AND32mi; - case X86::AND64ri8: return X86::AND64ri32; - case X86::AND64mi8: return X86::AND64mi32; - - // OR - case X86::OR16ri8: return X86::OR16ri; - case X86::OR16mi8: return X86::OR16mi; - case X86::OR32ri8: return X86::OR32ri; - case X86::OR32mi8: return X86::OR32mi; - case X86::OR64ri8: return X86::OR64ri32; - case X86::OR64mi8: return X86::OR64mi32; - - // XOR - case X86::XOR16ri8: return X86::XOR16ri; - case X86::XOR16mi8: return X86::XOR16mi; - case X86::XOR32ri8: return X86::XOR32ri; - case X86::XOR32mi8: return X86::XOR32mi; - case X86::XOR64ri8: return X86::XOR64ri32; - case X86::XOR64mi8: return X86::XOR64mi32; - - // ADD - case X86::ADD16ri8: return X86::ADD16ri; - case X86::ADD16mi8: return X86::ADD16mi; - case X86::ADD32ri8: return X86::ADD32ri; - case X86::ADD32mi8: return X86::ADD32mi; - case X86::ADD64ri8: return X86::ADD64ri32; - case X86::ADD64mi8: return X86::ADD64mi32; - - // ADC - case X86::ADC16ri8: return X86::ADC16ri; - case X86::ADC16mi8: return X86::ADC16mi; - case X86::ADC32ri8: return X86::ADC32ri; - case X86::ADC32mi8: return X86::ADC32mi; - case X86::ADC64ri8: return X86::ADC64ri32; - case X86::ADC64mi8: return X86::ADC64mi32; - - // SUB - case X86::SUB16ri8: return X86::SUB16ri; - case X86::SUB16mi8: return X86::SUB16mi; - case X86::SUB32ri8: return X86::SUB32ri; - case X86::SUB32mi8: return X86::SUB32mi; - case X86::SUB64ri8: return X86::SUB64ri32; - case X86::SUB64mi8: return X86::SUB64mi32; - - // SBB - 
case X86::SBB16ri8: return X86::SBB16ri; - case X86::SBB16mi8: return X86::SBB16mi; - case X86::SBB32ri8: return X86::SBB32ri; - case X86::SBB32mi8: return X86::SBB32mi; - case X86::SBB64ri8: return X86::SBB64ri32; - case X86::SBB64mi8: return X86::SBB64mi32; - - // CMP - case X86::CMP16ri8: return X86::CMP16ri; - case X86::CMP16mi8: return X86::CMP16mi; - case X86::CMP32ri8: return X86::CMP32ri; - case X86::CMP32mi8: return X86::CMP32mi; - case X86::CMP64ri8: return X86::CMP64ri32; - case X86::CMP64mi8: return X86::CMP64mi32; - - // PUSH - case X86::PUSH32i8: return X86::PUSHi32; - case X86::PUSH16i8: return X86::PUSHi16; - case X86::PUSH64i8: return X86::PUSH64i32; - } + return X86::getRelaxedOpcodeArith(Op); } static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) { @@ -372,7 +293,7 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst, /// - If the instruction has a ESP/EBP base register, use SS. /// - Otherwise use DS. uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const { - assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) && + assert((STI.hasFeature(X86::Is32Bit) || STI.hasFeature(X86::Is64Bit)) && "Prefixes can be added only in 32-bit or 64-bit mode."); const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); uint64_t TSFlags = Desc.TSFlags; @@ -413,7 +334,7 @@ uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const { if (SegmentReg != 0) return X86::getSegmentOverridePrefixForReg(SegmentReg); - if (STI.hasFeature(X86::Mode64Bit)) + if (STI.hasFeature(X86::Is64Bit)) return X86::CS_Encoding; if (MemoryOperand >= 0) { @@ -572,7 +493,7 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const { return false; // Branches only need to be aligned in 32-bit or 64-bit mode. - if (!(STI.hasFeature(X86::Mode64Bit) || STI.hasFeature(X86::Mode32Bit))) + if (!(STI.hasFeature(X86::Is64Bit) || STI.hasFeature(X86::Is32Bit))) return false; return true; @@ -834,7 +755,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, void X86AsmBackend::relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const { // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. - bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + bool Is16BitMode = STI.getFeatureBits()[X86::Is16Bit]; unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode); if (RelaxedOp == Inst.getOpcode()) { @@ -853,7 +774,7 @@ void X86AsmBackend::relaxInstruction(MCInst &Inst, static bool isFullyRelaxed(const MCRelaxableFragment &RF) { auto &Inst = RF.getInst(); auto &STI = *RF.getSubtargetInfo(); - bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + bool Is16BitMode = STI.getFeatureBits()[X86::Is16Bit]; return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode(); } @@ -1077,9 +998,9 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, } unsigned X86AsmBackend::getMaximumNopSize(const MCSubtargetInfo &STI) const { - if (STI.hasFeature(X86::Mode16Bit)) + if (STI.hasFeature(X86::Is16Bit)) return 4; - if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) + if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Is64Bit)) return 1; if (STI.getFeatureBits()[X86::TuningFast7ByteNOP]) return 7; @@ -1134,7 +1055,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, }; const char(*Nops)[11] = - STI->getFeatureBits()[X86::Mode16Bit] ? Nops16Bit : Nops32Bit; + STI->getFeatureBits()[X86::Is16Bit] ? 
Nops16Bit : Nops32Bit; uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(*STI); @@ -1449,7 +1370,6 @@ public: unsigned InstrOffset = 0; unsigned StackAdjust = 0; unsigned StackSize = 0; - unsigned NumDefCFAOffsets = 0; int MinAbsOffset = std::numeric_limits::max(); for (const MCCFIInstruction &Inst : Instrs) { @@ -1457,7 +1377,7 @@ public: default: // Any other CFI directives indicate a frame that we aren't prepared // to represent via compact unwind, so just bail out. - return 0; + return CU::UNWIND_MODE_DWARF; case MCCFIInstruction::OpDefCfaRegister: { // Defines a frame pointer. E.g. // @@ -1471,7 +1391,7 @@ public: // generate a compact unwinding representation, so bail out. if (*MRI.getLLVMRegNum(Inst.getRegister(), true) != (Is64Bit ? X86::RBP : X86::EBP)) - return 0; + return CU::UNWIND_MODE_DWARF; // Reset the counts. memset(SavedRegs, 0, sizeof(SavedRegs)); @@ -1497,7 +1417,6 @@ public: // .cfi_def_cfa_offset 80 // StackSize = Inst.getOffset() / StackDivide; - ++NumDefCFAOffsets; break; } case MCCFIInstruction::OpOffset: { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 167580ec1ed0..e78e98cfc09e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -18,10 +18,11 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" -#include +#include "llvm/Support/raw_ostream.h" #include +#include using namespace llvm; @@ -349,7 +350,8 @@ void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo, } } -void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { +void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O, + const MCSubtargetInfo &STI) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t TSFlags = Desc.TSFlags; unsigned Flags = MI->getFlags(); @@ -379,6 +381,20 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { O << "\t{disp8}"; else if (Flags & X86::IP_USE_DISP32) O << "\t{disp32}"; + + // Determine where the memory operand starts, if present + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) + MemoryOperand += X86II::getOperandBias(Desc); + + // Address-Size override prefix + if (Flags & X86::IP_HAS_AD_SIZE && + !X86_MC::needsAddressSizeOverride(*MI, STI, MemoryOperand, TSFlags)) { + if (STI.hasFeature(X86::Is16Bit) || STI.hasFeature(X86::Is64Bit)) + O << "\taddr32\t"; + else if (STI.hasFeature(X86::Is32Bit)) + O << "\taddr16\t"; + } } void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index fd82bdcd1a23..0cb5bf014b20 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -33,7 +33,8 @@ public: raw_ostream &O); protected: - void printInstFlags(const MCInst *MI, raw_ostream &O); + void printInstFlags(const MCInst *MI, raw_ostream &O, + const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp 
b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp new file mode 100644 index 000000000000..901082ce6cf3 --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp @@ -0,0 +1,165 @@ +//===- X86InstrRelaxTables.cpp - X86 Instruction Relaxation Tables -*- C++ -*-// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 instruction relaxation tables. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrRelaxTables.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/STLExtras.h" + +using namespace llvm; + +// These tables are sorted by their ShortOp value allowing them to be binary +// searched at runtime without the need for additional storage. The enum values +// are currently emitted in X86GenInstrInfo.inc in alphabetical order. Which +// makes sorting these tables a simple matter of alphabetizing the table. +static const X86InstrRelaxTableEntry InstrRelaxTable[] = { + // ADC + { X86::ADC16mi8, X86::ADC16mi }, + { X86::ADC16ri8, X86::ADC16ri }, + { X86::ADC32mi8, X86::ADC32mi }, + { X86::ADC32ri8, X86::ADC32ri }, + { X86::ADC64mi8, X86::ADC64mi32 }, + { X86::ADC64ri8, X86::ADC64ri32 }, + // ADD + { X86::ADD16mi8, X86::ADD16mi }, + { X86::ADD16ri8, X86::ADD16ri }, + { X86::ADD32mi8, X86::ADD32mi }, + { X86::ADD32ri8, X86::ADD32ri }, + { X86::ADD64mi8, X86::ADD64mi32 }, + { X86::ADD64ri8, X86::ADD64ri32 }, + // AND + { X86::AND16mi8, X86::AND16mi }, + { X86::AND16ri8, X86::AND16ri }, + { X86::AND32mi8, X86::AND32mi }, + { X86::AND32ri8, X86::AND32ri }, + { X86::AND64mi8, X86::AND64mi32 }, + { X86::AND64ri8, X86::AND64ri32 }, + // CMP + { X86::CMP16mi8, X86::CMP16mi }, + { X86::CMP16ri8, X86::CMP16ri }, + { X86::CMP32mi8, X86::CMP32mi }, + { X86::CMP32ri8, X86::CMP32ri }, + { X86::CMP64mi8, X86::CMP64mi32 }, + { X86::CMP64ri8, X86::CMP64ri32 }, + // IMUL + { X86::IMUL16rmi8, X86::IMUL16rmi }, + { X86::IMUL16rri8, X86::IMUL16rri }, + { X86::IMUL32rmi8, X86::IMUL32rmi }, + { X86::IMUL32rri8, X86::IMUL32rri }, + { X86::IMUL64rmi8, X86::IMUL64rmi32 }, + { X86::IMUL64rri8, X86::IMUL64rri32 }, + // OR + { X86::OR16mi8, X86::OR16mi }, + { X86::OR16ri8, X86::OR16ri }, + { X86::OR32mi8, X86::OR32mi }, + { X86::OR32ri8, X86::OR32ri }, + { X86::OR64mi8, X86::OR64mi32 }, + { X86::OR64ri8, X86::OR64ri32 }, + // PUSH + { X86::PUSH16i8, X86::PUSHi16 }, + { X86::PUSH32i8, X86::PUSHi32 }, + { X86::PUSH64i8, X86::PUSH64i32 }, + // SBB + { X86::SBB16mi8, X86::SBB16mi }, + { X86::SBB16ri8, X86::SBB16ri }, + { X86::SBB32mi8, X86::SBB32mi }, + { X86::SBB32ri8, X86::SBB32ri }, + { X86::SBB64mi8, X86::SBB64mi32 }, + { X86::SBB64ri8, X86::SBB64ri32 }, + // SUB + { X86::SUB16mi8, X86::SUB16mi }, + { X86::SUB16ri8, X86::SUB16ri }, + { X86::SUB32mi8, X86::SUB32mi }, + { X86::SUB32ri8, X86::SUB32ri }, + { X86::SUB64mi8, X86::SUB64mi32 }, + { X86::SUB64ri8, X86::SUB64ri32 }, + // XOR + { X86::XOR16mi8, X86::XOR16mi }, + { X86::XOR16ri8, X86::XOR16ri }, + { X86::XOR32mi8, X86::XOR32mi }, + { X86::XOR32ri8, X86::XOR32ri }, + { X86::XOR64mi8, X86::XOR64mi32 }, + { X86::XOR64ri8, X86::XOR64ri32 }, +}; + +static const X86InstrRelaxTableEntry * +lookupRelaxTableImpl(ArrayRef Table, + unsigned ShortOp) { +#ifndef NDEBUG + // Make sure the tables are sorted. 
+ static std::atomic RelaxTableChecked(false); + if (!RelaxTableChecked.load(std::memory_order_relaxed)) { + assert(llvm::is_sorted(InstrRelaxTable) && + std::adjacent_find(std::begin(InstrRelaxTable), + std::end(InstrRelaxTable)) == + std::end(InstrRelaxTable) && + "InstrRelaxTable is not sorted and unique!"); + RelaxTableChecked.store(true, std::memory_order_relaxed); + } +#endif + + const X86InstrRelaxTableEntry *Data = llvm::lower_bound(Table, ShortOp); + if (Data != Table.end() && Data->KeyOp == ShortOp) + return Data; + return nullptr; +} + +const X86InstrRelaxTableEntry *llvm::lookupRelaxTable(unsigned ShortOp) { + return lookupRelaxTableImpl(InstrRelaxTable, ShortOp); +} + +namespace { + +// This class stores the short form tables. It is instantiated as a +// ManagedStatic to lazily init the short form table. +struct X86ShortFormTable { + // Stores relaxation table entries sorted by relaxed form opcode. + SmallVector Table; + + X86ShortFormTable() { + for (const X86InstrRelaxTableEntry &Entry : InstrRelaxTable) + Table.push_back({Entry.DstOp, Entry.KeyOp}); + + llvm::sort(Table); + + // Now that it's sorted, ensure its unique. + assert(std::adjacent_find(Table.begin(), Table.end()) == Table.end() && + "Short form table is not unique!"); + } +}; +} // namespace + +static ManagedStatic ShortTable; + +const X86InstrRelaxTableEntry *llvm::lookupShortTable(unsigned RelaxOp) { + auto &Table = ShortTable->Table; + auto I = llvm::lower_bound(Table, RelaxOp); + if (I != Table.end() && I->KeyOp == RelaxOp) + return &*I; + return nullptr; +} + +namespace llvm { + +/// Get the short instruction opcode for a given relaxed opcode. +unsigned X86::getShortOpcodeArith(unsigned RelaxOp) { + if (const X86InstrRelaxTableEntry *I = lookupShortTable(RelaxOp)) + return I->DstOp; + return RelaxOp; +} + +/// Get the relaxed instruction opcode for a given short opcode. +unsigned X86::getRelaxedOpcodeArith(unsigned ShortOp) { + if (const X86InstrRelaxTableEntry *I = lookupRelaxTable(ShortOp)) + return I->DstOp; + return ShortOp; +} +} // namespace llvm diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h new file mode 100644 index 000000000000..0551c1861a58 --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h @@ -0,0 +1,54 @@ +//===-- X86InstrRelaxTables.h - X86 Instruction Relaxation Tables -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the interface to query the X86 instruction relaxation +// tables. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRRELAXTABLES_H +#define LLVM_LIB_TARGET_X86_X86INSTRRELAXTABLES_H + +#include + +namespace llvm { + +// This struct is used for both the relaxed and short tables. The KeyOp is used +// to determine the sorting order. 
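// In InstrRelaxTable the key is the short (imm8) opcode and DstOp its relaxed
// (imm16/imm32) form; the lazily built short-form table stores the same pairs
// with the roles swapped. Illustrative use, as a sketch that assumes the
// X86Gen opcode enum (opcodes absent from the table pass through unchanged):
//
//   unsigned Relaxed = X86::getRelaxedOpcodeArith(X86::CMP32ri8); // CMP32ri
//   unsigned Short   = X86::getShortOpcodeArith(Relaxed);         // CMP32ri8
//   unsigned Nop     = X86::getRelaxedOpcodeArith(X86::NOOP);     // unchanged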
+struct X86InstrRelaxTableEntry { + uint16_t KeyOp; + uint16_t DstOp; + + bool operator<(const X86InstrRelaxTableEntry &RHS) const { + return KeyOp < RHS.KeyOp; + } + bool operator==(const X86InstrRelaxTableEntry &RHS) const { + return KeyOp == RHS.KeyOp; + } + friend bool operator<(const X86InstrRelaxTableEntry &TE, unsigned Opcode) { + return TE.KeyOp < Opcode; + } +}; + +/// Look up the relaxed form table entry for a given \p ShortOp. +const X86InstrRelaxTableEntry *lookupRelaxTable(unsigned ShortOp); + +/// Look up the short form table entry for a given \p RelaxOp. +const X86InstrRelaxTableEntry *lookupShortTable(unsigned RelaxOp); + +namespace X86 { + +/// Get the short instruction opcode for a given relaxed opcode. +unsigned getShortOpcodeArith(unsigned RelaxOp); + +/// Get the relaxed instruction opcode for a given short opcode. +unsigned getRelaxedOpcodeArith(unsigned ShortOp); +} // namespace X86 +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index 48c335f9a777..2a2afa925a9c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -40,11 +40,11 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { - printInstFlags(MI, OS); + printInstFlags(MI, OS, STI); // In 16-bit mode, print data16 as data32. if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { + STI.getFeatureBits()[X86::Is16Bit]) { OS << "\tdata32"; } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) printInstruction(MI, Address, OS); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 4fa8bc64b245..a21bb6da86de 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -155,65 +156,6 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { return MCFixup::getKindForSize(Size, isPCRel); } -/// \param Op operand # of the memory operand. -/// -/// \returns true if the specified instruction has a 16-bit memory operand. -static bool is16BitMemOperand(const MCInst &MI, unsigned Op, - const MCSubtargetInfo &STI) { - const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); - - unsigned BaseReg = Base.getReg(); - unsigned IndexReg = Index.getReg(); - - if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0) - return true; - if ((BaseReg != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || - (IndexReg != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))) - return true; - return false; -} - -/// \param Op operand # of the memory operand. -/// -/// \returns true if the specified instruction has a 32-bit memory operand. 
-static bool is32BitMemOperand(const MCInst &MI, unsigned Op) { - const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) - return true; - if (BaseReg.getReg() == X86::EIP) { - assert(IndexReg.getReg() == 0 && "Invalid eip-based address."); - return true; - } - if (IndexReg.getReg() == X86::EIZ) - return true; - return false; -} - -/// \param Op operand # of the memory operand. -/// -/// \returns true if the specified instruction has a 64-bit memory operand. -#ifndef NDEBUG -static bool is64BitMemOperand(const MCInst &MI, unsigned Op) { - const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} -#endif - enum GlobalOffsetTableExprKind { GOT_None, GOT_Normal, GOT_SymDiff }; /// Check if this expression starts with _GLOBAL_OFFSET_TABLE_ and if it is @@ -391,7 +333,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Handle %rip relative addressing. if (BaseReg == X86::RIP || BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode - assert(STI.hasFeature(X86::Mode64Bit) && + assert(STI.hasFeature(X86::Is64Bit) && "Rip-relative addressing requires 64-bit mode"); assert(IndexReg.getReg() == 0 && !ForceSIB && "Invalid rip-relative address"); @@ -462,7 +404,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // 16-bit addressing forms of the ModR/M byte have a different encoding for // the R/M field and are far more limited in which registers can be used. - if (is16BitMemOperand(MI, Op, STI)) { + if (X86_MC::is16BitMemOperand(MI, Op, STI)) { if (BaseReg) { // For 32-bit addressing, the row and column values in Table 2-2 are // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with @@ -540,7 +482,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::ESP && // If there is no base register and we're in 64-bit mode, we need a SIB // byte to emit an addr that is just 'disp32' (the non-RIP relative form). - (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) { + (!STI.hasFeature(X86::Is64Bit) || BaseReg != 0)) { if (BaseReg == 0) { // [disp32] in X86-32 mode emitByte(modRMByte(0, RegOpcodeField, 5), OS); @@ -671,75 +613,29 @@ bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI, emitByte(0xF2, OS); // Emit the address size opcode prefix as needed. 
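  // The 0x67 prefix is required whenever the effective address size differs
  // from the mode's default: an AdSize16/AdSize32 mismatch encoded in TSFlags,
  // a 32-bit memory operand in 64-bit mode, a 16-bit memory operand in 32-bit
  // mode, a non-16-bit one in 16-bit mode, or a string instruction whose
  // SI/DI operands do not match the mode. That test is shared with the
  // instruction printers via X86_MC::needsAddressSizeOverride; an explicitly
  // requested override recorded as X86::IP_HAS_AD_SIZE forces the prefix too.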
- bool NeedAddressOverride; - uint64_t AdSize = TSFlags & X86II::AdSizeMask; - if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) || - (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) || - (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) { - NeedAddressOverride = true; - } else if (MemoryOperand < 0) { - NeedAddressOverride = false; - } else if (STI.hasFeature(X86::Mode64Bit)) { - assert(!is16BitMemOperand(MI, MemoryOperand, STI)); - NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand); - } else if (STI.hasFeature(X86::Mode32Bit)) { - assert(!is64BitMemOperand(MI, MemoryOperand)); - NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI); - } else { - assert(STI.hasFeature(X86::Mode16Bit)); - assert(!is64BitMemOperand(MI, MemoryOperand)); - NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI); - } - - if (NeedAddressOverride) + if (X86_MC::needsAddressSizeOverride(MI, STI, MemoryOperand, TSFlags) || + Flags & X86::IP_HAS_AD_SIZE) emitByte(0x67, OS); - // Encoding type for this instruction. - uint64_t Encoding = TSFlags & X86II::EncodingMask; - bool HasREX = false; - if (Encoding) - emitVEXOpcodePrefix(MemoryOperand, MI, OS); - else - HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS); - uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { default: break; case X86II::RawFrmDstSrc: { - unsigned siReg = MI.getOperand(1).getReg(); - assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || - (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || - (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && - "SI and DI register sizes do not match"); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(2).getReg() != X86::DS) emitSegmentOverridePrefix(2, MI, OS); - // Emit AdSize prefix as needed. - if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || - (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, OS); CurOp += 3; // Consume operands. break; } case X86II::RawFrmSrc: { - unsigned siReg = MI.getOperand(0).getReg(); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(1).getReg() != X86::DS) emitSegmentOverridePrefix(1, MI, OS); - // Emit AdSize prefix as needed. - if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || - (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, OS); CurOp += 2; // Consume operands. break; } case X86II::RawFrmDst: { - unsigned siReg = MI.getOperand(0).getReg(); - // Emit AdSize prefix as needed. - if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) || - (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI)) - emitByte(0x67, OS); ++CurOp; // Consume operand. break; } @@ -750,6 +646,15 @@ bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI, } } + // REX prefix is optional, but if used must be immediately before the opcode + // Encoding type for this instruction. + uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool HasREX = false; + if (Encoding) + emitVEXOpcodePrefix(MemoryOperand, MI, OS); + else + HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS); + return HasREX; } @@ -1347,7 +1252,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, // Emit the operand size opcode prefix as needed. if ((TSFlags & X86II::OpSizeMask) == - (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16)) + (STI.hasFeature(X86::Is16Bit) ? 
X86II::OpSize32 : X86II::OpSize16)) emitByte(0x66, OS); // Emit the LOCK opcode prefix. @@ -1371,9 +1276,9 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, } // Handle REX prefix. - assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) && + assert((STI.hasFeature(X86::Is64Bit) || !(TSFlags & X86II::REX_W)) && "REX.W requires 64bit mode."); - bool HasREX = STI.hasFeature(X86::Mode64Bit) + bool HasREX = STI.hasFeature(X86::Is64Bit) ? emitREXPrefix(MemOperand, MI, STI, OS) : false; @@ -1472,7 +1377,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::RawFrm: emitByte(BaseOpcode + OpcodeOffset, OS); - if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII)) + if (!STI.hasFeature(X86::Is64Bit) || !isPCRel32Branch(MI, MCII)) break; const MCOperand &Op = MI.getOperand(CurOp++); @@ -1842,7 +1747,6 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h index 532fecd9951b..cd2baeb1c98e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h @@ -18,6 +18,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 8913e405539e..49660883ad83 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -72,6 +72,97 @@ bool X86_MC::hasLockPrefix(const MCInst &MI) { return MI.getFlags() & X86::IP_HAS_LOCK; } +static bool isMemOperand(const MCInst &MI, unsigned Op, unsigned RegClassID) { + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + const MCRegisterClass &RC = X86MCRegisterClasses[RegClassID]; + + return (Base.isReg() && Base.getReg() != 0 && RC.contains(Base.getReg())) || + (Index.isReg() && Index.getReg() != 0 && RC.contains(Index.getReg())); +} + +bool X86_MC::is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI) { + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + + if (STI.hasFeature(X86::Is16Bit) && Base.isReg() && Base.getReg() == 0 && + Index.isReg() && Index.getReg() == 0) + return true; + return isMemOperand(MI, Op, X86::GR16RegClassID); +} + +bool X86_MC::is32BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + if (Base.isReg() && Base.getReg() == X86::EIP) { + assert(Index.isReg() && Index.getReg() == 0 && "Invalid eip-based address"); + return true; + } + if (Index.isReg() && Index.getReg() == X86::EIZ) + return true; + return isMemOperand(MI, Op, X86::GR32RegClassID); +} + +#ifndef NDEBUG +bool X86_MC::is64BitMemOperand(const MCInst &MI, unsigned Op) { + return isMemOperand(MI, Op, X86::GR64RegClassID); +} +#endif + +bool X86_MC::needsAddressSizeOverride(const MCInst &MI, + const MCSubtargetInfo &STI, + int MemoryOperand, uint64_t TSFlags) 
{ + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + bool Is16BitMode = STI.hasFeature(X86::Is16Bit); + bool Is32BitMode = STI.hasFeature(X86::Is32Bit); + bool Is64BitMode = STI.hasFeature(X86::Is64Bit); + if ((Is16BitMode && AdSize == X86II::AdSize32) || + (Is32BitMode && AdSize == X86II::AdSize16) || + (Is64BitMode && AdSize == X86II::AdSize32)) + return true; + uint64_t Form = TSFlags & X86II::FormMask; + switch (Form) { + default: + break; + case X86II::RawFrmDstSrc: { + unsigned siReg = MI.getOperand(1).getReg(); + assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || + (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || + (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && + "SI and DI register sizes do not match"); + return (!Is32BitMode && siReg == X86::ESI) || + (Is32BitMode && siReg == X86::SI); + } + case X86II::RawFrmSrc: { + unsigned siReg = MI.getOperand(0).getReg(); + return (!Is32BitMode && siReg == X86::ESI) || + (Is32BitMode && siReg == X86::SI); + } + case X86II::RawFrmDst: { + unsigned siReg = MI.getOperand(0).getReg(); + return (!Is32BitMode && siReg == X86::EDI) || + (Is32BitMode && siReg == X86::DI); + } + } + + // Determine where the memory operand starts, if present. + if (MemoryOperand < 0) + return false; + + if (STI.hasFeature(X86::Is64Bit)) { + assert(!is16BitMemOperand(MI, MemoryOperand, STI)); + return is32BitMemOperand(MI, MemoryOperand); + } + if (STI.hasFeature(X86::Is32Bit)) { + assert(!is64BitMemOperand(MI, MemoryOperand)); + return is16BitMemOperand(MI, MemoryOperand, STI); + } + assert(STI.hasFeature(X86::Is16Bit)); + assert(!is64BitMemOperand(MI, MemoryOperand)); + return !is16BitMemOperand(MI, MemoryOperand, STI); +} + void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { // FIXME: TableGen these. for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 35604cd3ec0a..d0530bd4d650 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -63,6 +63,28 @@ void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI); /// Returns true if this instruction has a LOCK prefix. bool hasLockPrefix(const MCInst &MI); +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 16-bit memory operand. +bool is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI); + +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 32-bit memory operand. +bool is32BitMemOperand(const MCInst &MI, unsigned Op); + +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 64-bit memory operand. +#ifndef NDEBUG +bool is64BitMemOperand(const MCInst &MI, unsigned Op); +#endif + +/// Returns true if this instruction needs an Address-Size override prefix. +bool needsAddressSizeOverride(const MCInst &MI, const MCSubtargetInfo &STI, + int MemoryOperand, uint64_t TSFlags); + /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. /// do not need to go through TargetRegistry. 
MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, @@ -70,7 +92,6 @@ MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, } MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, @@ -142,4 +163,7 @@ MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned, #define GET_SUBTARGETINFO_ENUM #include "X86GenSubtargetInfo.inc" +#define GET_X86_MNEMONIC_TABLES_H +#include "X86GenMnemonicTables.inc" + #endif diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp new file mode 100644 index 000000000000..39b7f0f4160e --- /dev/null +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp @@ -0,0 +1,16 @@ +//===-- X86MnemonicTables.cpp - X86 Mnemonic Tables -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 mnemonic tables. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" + +#define GET_X86_MNEMONIC_TABLES_CPP +#include "X86GenMnemonicTables.inc" diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index c29211246123..36945d1f6746 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -9,6 +9,7 @@ #include "X86MCTargetDesc.h" #include "X86TargetStreamer.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCWin64EH.h" @@ -25,15 +26,15 @@ public: std::unique_ptr OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitWinEHHandlerData(SMLoc Loc) override; - void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; - void EmitWindowsUnwindTables() override; - void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; + void emitWinEHHandlerData(SMLoc Loc) override; + void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void emitWindowsUnwindTables() override; + void emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; void finishImpl() override; }; -void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { - MCStreamer::EmitWinEHHandlerData(Loc); +void X86WinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) { + MCStreamer::emitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section. 
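  // ("This directive" is .seh_handlerdata: MCStreamer::emitWinEHHandlerData
  // switches the streamer to the associated .xdata section, so the unwind
  // codes collected for the current frame must be flushed first.)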
@@ -41,17 +42,17 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true); } -void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { +void X86WinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) { EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); } -void X86WinCOFFStreamer::EmitWindowsUnwindTables() { +void X86WinCOFFStreamer::emitWindowsUnwindTables() { if (!getNumWinFrameInfos()) return; EHStreamer.Emit(*this); } -void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { +void X86WinCOFFStreamer::emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { X86TargetStreamer *XTS = static_cast(getTargetStreamer()); XTS->emitFPOData(ProcSym, Loc); @@ -59,7 +60,7 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { void X86WinCOFFStreamer::finishImpl() { emitFrames(nullptr); - EmitWindowsUnwindTables(); + emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index bf3f4e990ecc..f2827c568109 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/FormattedStream.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 10e1c5d6ed38..7344900f2e31 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -79,6 +79,9 @@ FunctionPass *createX86DynAllocaExpander(); /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); +/// Return a pass that preconfig the tile registers before fast reg allocation. +FunctionPass *createX86FastPreTileConfigPass(); + /// Return a pass that config the tile registers after fast reg allocation. 
FunctionPass *createX86FastTileConfigPass(); @@ -175,6 +178,7 @@ void initializeX86PartialReductionPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); +void initializeX86FastPreTileConfigPass(PassRegistry &); void initializeX86FastTileConfigPass(PassRegistry &); void initializeX86TileConfigPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 380507308c3d..a5c6b40c493c 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -18,13 +18,13 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // X86 Subtarget state // - -def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", - "64-bit mode (x86_64)">; -def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true", - "32-bit mode (80386)">; -def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true", - "16-bit mode (i8086)">; +// disregarding specific ABI / programming model +def Is64Bit : SubtargetFeature<"64bit-mode", "Is64Bit", "true", + "64-bit mode (x86_64)">; +def Is32Bit : SubtargetFeature<"32bit-mode", "Is32Bit", "true", + "32-bit mode (80386)">; +def Is16Bit : SubtargetFeature<"16bit-mode", "Is16Bit", "true", + "16-bit mode (i8086)">; //===----------------------------------------------------------------------===// // X86 Subtarget ISA features @@ -34,16 +34,16 @@ def FeatureX87 : SubtargetFeature<"x87","HasX87", "true", "Enable X87 float instructions">; def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", - "Enable NOPL instruction">; + "Enable NOPL instruction (generally pentium pro+)">; -def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", +def FeatureCMOV : SubtargetFeature<"cmov","HasCMOV", "true", "Enable conditional move instructions">; -def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true", - "Support CMPXCHG8B instructions">; +def FeatureCX8 : SubtargetFeature<"cx8", "HasCX8", "true", + "Support CMPXCHG8B instructions">; def FeatureCRC32 : SubtargetFeature<"crc32", "HasCRC32", "true", - "Enable SSE 4.2 CRC32 instruction">; + "Enable SSE 4.2 CRC32 instruction (used when SSE4.2 is supported but function is GPR only)">; def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; @@ -98,11 +98,11 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", // feature, because SSE2 can be disabled (e.g. for compiling OS kernels) // without disabling 64-bit mode. Nothing should imply this feature bit. It // is used to enforce that only 64-bit capable CPUs are used in 64-bit mode. 
-def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", +def FeatureX86_64 : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions">; -def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", - "64-bit with cmpxchg16b", - [FeatureCMPXCHG8B]>; +def FeatureCX16 : SubtargetFeature<"cx16", "HasCX16", "true", + "64-bit with cmpxchg16b (this is true for most x86-64 chips, but not the first AMD chips)", + [FeatureCX8]>; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -119,7 +119,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", "Support 16-bit floating point conversion instructions", [FeatureAVX]>; -def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512", "Enable AVX-512 instructions", [FeatureAVX2, FeatureFMA, FeatureF16C]>; def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", @@ -198,7 +198,7 @@ def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", [FeatureFMA4]>; def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", "HasSSEUnalignedMem", "true", - "Allow unaligned memory operands with SSE instructions">; + "Allow unaligned memory operands with SSE instructions (this may require setting a configuration bit in the processor)">; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", "Enable AES instructions", [FeatureSSE2]>; @@ -228,20 +228,22 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; +// Processor supports CET SHSTK - Control-Flow Enforcement Technology +// using Shadow Stack def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true", "Support CET Shadow-Stack instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; -def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true", +def FeatureLAHFSAHF64 : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true", "Support LAHF and SAHF instructions in 64-bit mode">; def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true", "Enable MONITORX/MWAITX timer functionality">; def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true", "Enable Cache Line Zero">; def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", - "Enable Cache Demote">; + "Enable Cache Line Demote">; def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", "Support ptwrite instruction">; def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true", @@ -285,9 +287,9 @@ def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true", def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true", "platform configuration instruction">; def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", - "Support movdiri instruction">; + "Support movdiri instruction (direct store integer)">; def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", - "Support movdir64b instruction">; + "Support movdir64b instruction (direct store 64 bytes)">; // Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka // "string operations"). 
See "REP String Enhancement" in the Intel Software @@ -380,6 +382,17 @@ def FeatureTaggedGlobals "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits.">; +// Control codegen mitigation against Straight Line Speculation vulnerability. +def FeatureHardenSlsRet + : SubtargetFeature< + "harden-sls-ret", "HardenSlsRet", "true", + "Harden against straight line speculation across RET instructions.">; + +def FeatureHardenSlsIJmp + : SubtargetFeature< + "harden-sls-ijmp", "HardenSlsIJmp", "true", + "Harden against straight line speculation across indirect JMP instructions.">; + //===----------------------------------------------------------------------===// // X86 Subtarget Tuning features //===----------------------------------------------------------------------===// @@ -388,7 +401,7 @@ def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", - "PMULLD instruction is slow">; + "PMULLD instruction is slow (compared to PMULLW/PMULHW and PMULUDQ)">; def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow", "true", @@ -396,27 +409,31 @@ def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow", // FIXME: This should not apply to CPUs that do not have SSE. def TuningSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", - "IsUAMem16Slow", "true", + "IsUnalignedMem16Slow", "true", "Slow unaligned 16-byte memory access">; def TuningSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", + "IsUnalignedMem32Slow", "true", "Slow unaligned 32-byte memory access">; def TuningLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", - "Use LEA for adjusting the stack pointer">; + "Use LEA for adjusting the stack pointer (this is an optimization for Intel Atom processors)">; +// True if 8-bit divisions are significantly faster than +// 32-bit divisions and should be used when possible. def TuningSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; +// True if 32-bit divides are significantly faster than +// 64-bit divisions and should be used when possible. def TuningSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", "Use 32-bit divide for positive values less than 2^32">; def TuningPadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", - "Pad short functions">; + "Pad short functions (to prevent a stall when returning too early)">; // On some processors, instructions that implicitly take two memory operands are // slow. In practice, this means that CALL, PUSH, and POP with memory operands @@ -425,15 +442,21 @@ def TuningSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops", "SlowTwoMemOps", "true", "Two memory operand instructions are slow">; -def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", +// True if the LEA instruction inputs have to be ready at address generation +// (AG) time. 
+def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LeaUsesAG", "true", "LEA instruction needs inputs at AG stage">; def TuningSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +// True if the LEA instruction has all three source operands: base, index, +// and offset or if the LEA instruction uses base and index registers where +// the base is EBP, RBP,or R13 def TuningSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", "LEA instruction with 3 ops or certain registers is slow">; +// True if INC and DEC instructions are slow when writing to flags def TuningSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; @@ -445,6 +468,31 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt", "HasLZCNTFalseDeps", "true", "LZCNT/TZCNT have a false dependency on dest register">; +def TuningMULCFalseDeps : SubtargetFeature<"false-deps-mulc", + "HasMULCFalseDeps", "true", + "VF[C]MULCPH/SH has a false dependency on dest register">; + +def TuningPERMFalseDeps : SubtargetFeature<"false-deps-perm", + "HasPERMFalseDeps", "true", + "VPERMD/Q/PS/PD has a false dependency on dest register">; + +def TuningRANGEFalseDeps : SubtargetFeature<"false-deps-range", + "HasRANGEFalseDeps", "true", + "VRANGEPD/PS/SD/SS has a false dependency on dest register">; + +def TuningGETMANTFalseDeps : SubtargetFeature<"false-deps-getmant", + "HasGETMANTFalseDeps", "true", + "VGETMANTSS/SD/SH and VGETMANDPS/PD(memory version) has a" + " false dependency on dest register">; + +def TuningMULLQFalseDeps : SubtargetFeature<"false-deps-mullq", + "HasMULLQFalseDeps", "true", + "VPMULLQ has a false dependency on dest register">; + +def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking", + "HasSBBDepBreaking", "true", + "SBB with same register has no source dependency">; + // On recent X86 (port bound) processors, its preferable to combine to a single shuffle // using a variable mask over multiple fixed shuffles. def TuningFastVariableCrossLaneShuffle @@ -470,9 +518,14 @@ def TuningInsertVZEROUPPER // vectorized code we should care about the throughput of SQRT operations. // But if the code is scalar that probably means that the code has some kind of // dependency and we should care more about reducing the latency. + +// True if hardware SQRTSS instruction is at least as fast (latency) as +// RSQRTSS followed by a Newton-Raphson iteration. def TuningFastScalarFSQRT : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +// True if hardware SQRTPS/VSQRTPS instructions are at least as fast +// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. def TuningFastVectorFSQRT : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", "true", "Vector SQRT is fast (disable Newton-Raphson)">; @@ -529,7 +582,7 @@ def TuningMacroFusion // similar to Skylake Server (AVX-512). 
def TuningFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", - "Indicates if gather is reasonably fast">; + "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">; def TuningPrefer128Bit : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true", @@ -578,17 +631,13 @@ def TuningUseGLMDivSqrtCosts : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true", "Use Goldmont specific floating point div/sqrt costs">; -// Enable use of alias analysis during code generation. -def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", - "Use alias analysis during codegen">; - //===----------------------------------------------------------------------===// // X86 CPU Families // TODO: Remove these - use general tuning features to determine codegen. //===----------------------------------------------------------------------===// // Bonnell -def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; +def ProcIntelAtom : SubtargetFeature<"", "IsAtom", "true", "Is Intel Atom processor">; //===----------------------------------------------------------------------===// // Register File Description @@ -632,11 +681,11 @@ include "X86SchedIceLake.td" def ProcessorFeatures { // x86-64 and x86-64-v[234] list X86_64V1Features = [ - FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, Feature64Bit + FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureNOPL, FeatureX86_64, ]; list X86_64V2Features = !listconcat(X86_64V1Features, [ - FeatureCMPXCHG16B, FeatureLAHFSAHF, FeatureCRC32, FeaturePOPCNT, + FeatureCX16, FeatureLAHFSAHF64, FeatureCRC32, FeaturePOPCNT, FeatureSSE42 ]); list X86_64V3Features = !listconcat(X86_64V2Features, [ @@ -862,22 +911,27 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureUINTR]; - list SPRTuning = ICXTuning; + list SPRAdditionalTuning = [TuningMULCFalseDeps, + TuningPERMFalseDeps, + TuningRANGEFalseDeps, + TuningGETMANTFalseDeps, + TuningMULLQFalseDeps]; + list SPRTuning = !listconcat(ICXTuning, SPRAdditionalTuning); list SPRFeatures = !listconcat(ICXFeatures, SPRAdditionalFeatures); // Atom list AtomFeatures = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSSE3, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureMOVBE, - FeatureLAHFSAHF]; + FeatureLAHFSAHF64]; list AtomTuning = [ProcIntelAtom, TuningSlowUAMem16, TuningLEAForSP, @@ -968,25 +1022,26 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list ADLTuning = SKLTuning; + list ADLAdditionalTuning = [TuningPERMFalseDeps]; + list ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); // Knights Landing list KNLFeatures = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureCRC32, FeaturePOPCNT, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, - FeatureLAHFSAHF, + FeatureLAHFSAHF64, FeatureAES, FeatureRDRAND, FeatureF16C, @@ -1018,41 +1073,43 @@ def ProcessorFeatures { // Barcelona list BarcelonaFeatures = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureNOPL, - FeatureCMPXCHG16B, + FeatureCX16, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureLAHFSAHF, + FeatureLAHFSAHF64, FeatureCMOV, - Feature64Bit]; 
+ FeatureX86_64]; list BarcelonaTuning = [TuningFastScalarShiftMasks, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // Bobcat list BtVer1Features = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSSE3, FeatureSSE4A, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureLAHFSAHF]; + FeatureLAHFSAHF64]; list BtVer1Tuning = [TuningFast15ByteNOP, TuningFastScalarShiftMasks, TuningFastVectorShiftMasks, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // Jaguar @@ -1072,17 +1129,18 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningFastVectorShiftMasks, TuningFastMOVBE, + TuningSBBDepBreaking, TuningSlowSHLD]; list BtVer2Features = !listconcat(BtVer1Features, BtVer2AdditionalFeatures); // Bulldozer list BdVer1Features = [FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureXOP, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureAES, FeatureCRC32, FeaturePRFCHW, @@ -1094,11 +1152,12 @@ def ProcessorFeatures { FeaturePOPCNT, FeatureXSAVE, FeatureLWP, - FeatureLAHFSAHF]; + FeatureLAHFSAHF64]; list BdVer1Tuning = [TuningSlowSHLD, TuningFast11ByteNOP, TuningFastScalarShiftMasks, TuningBranchFusion, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // PileDriver @@ -1140,15 +1199,15 @@ def ProcessorFeatures { FeatureCLFLUSHOPT, FeatureCLZERO, FeatureCMOV, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, FeatureCRC32, FeatureF16C, FeatureFMA, FeatureFSGSBase, FeatureFXSR, FeatureNOPL, - FeatureLAHFSAHF, + FeatureLAHFSAHF64, FeatureLZCNT, FeatureMMX, FeatureMOVBE, @@ -1169,9 +1228,13 @@ def ProcessorFeatures { TuningFastBEXTR, TuningFast15ByteNOP, TuningBranchFusion, + TuningFastScalarFSQRT, + TuningFastVectorFSQRT, TuningFastScalarShiftMasks, + TuningFastVariablePerLaneShuffle, TuningFastMOVBE, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; list ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, @@ -1184,11 +1247,9 @@ def ProcessorFeatures { FeaturePKU, FeatureVAES, FeatureVPCLMULQDQ]; - list ZN3AdditionalTuning = - [TuningMacroFusion, - TuningFastVariablePerLaneShuffle]; + list ZN3AdditionalTuning = [TuningMacroFusion]; list ZN3Tuning = - !listconcat(ZNTuning, ZN3AdditionalTuning); + !listconcat(ZN2Tuning, ZN3AdditionalTuning); list ZN3Features = !listconcat(ZN2Features, ZN3AdditionalFeatures); } @@ -1209,39 +1270,43 @@ class ProcModel; def : Proc<"i386", [FeatureX87], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"i486", [FeatureX87], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B], +def : Proc<"i586", [FeatureX87, FeatureCX8], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B], +def : Proc<"pentium", [FeatureX87, FeatureCX8], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], +def : Proc<"pentium-mmx", [FeatureX87, FeatureCX8, FeatureMMX], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV], +def : Proc<"i686", [FeatureX87, FeatureCX8, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, +def : Proc<"pentiumpro", [FeatureX87, FeatureCX8, FeatureCMOV, FeatureNOPL], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"pentium2", [FeatureX87, 
FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, +def : Proc<"pentium2", [FeatureX87, FeatureCX8, FeatureMMX, FeatureCMOV, FeatureFXSR, FeatureNOPL], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["pentium3", "pentium3m"] in { - def : Proc; } @@ -1257,42 +1322,42 @@ foreach P = ["pentium3", "pentium3m"] in { // changes slightly. def : ProcModel<"pentium-m", GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, + [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { def : ProcModel; } // Intel Quark. -def : Proc<"lakemont", [FeatureCMPXCHG8B], +def : Proc<"lakemont", [FeatureCX8], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; // Intel Core Duo. def : ProcModel<"yonah", SandyBridgeModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; // NetBurst. def : ProcModel<"prescott", GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : ProcModel<"nocona", GenericPostRAModel, [ FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, + FeatureX86_64, + FeatureCX16, ], [ TuningSlowUAMem16, @@ -1302,15 +1367,15 @@ def : ProcModel<"nocona", GenericPostRAModel, [ // Intel Core 2 Solo/Duo. def : ProcModel<"core2", SandyBridgeModel, [ FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSSE3, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureLAHFSAHF + FeatureX86_64, + FeatureCX16, + FeatureLAHFSAHF64 ], [ TuningMacroFusion, @@ -1319,15 +1384,15 @@ def : ProcModel<"core2", SandyBridgeModel, [ ]>; def : ProcModel<"penryn", SandyBridgeModel, [ FeatureX87, - FeatureCMPXCHG8B, + FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE41, FeatureFXSR, FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureLAHFSAHF + FeatureX86_64, + FeatureCX16, + FeatureLAHFSAHF64 ], [ TuningMacroFusion, @@ -1416,38 +1481,38 @@ def : ProcModel<"alderlake", SkylakeClientModel, // AMD CPUs. 
-def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], +def : Proc<"k6", [FeatureX87, FeatureCX8, FeatureMMX], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], +def : Proc<"k6-2", [FeatureX87, FeatureCX8, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], +def : Proc<"k6-3", [FeatureX87, FeatureCX8, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { - def : Proc; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc; + TuningSBBDepBreaking, TuningInsertVZEROUPPER]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc; + TuningSBBDepBreaking, TuningInsertVZEROUPPER]>; } foreach P = ["amdfam10", "barcelona"] in { @@ -1482,7 +1547,7 @@ def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features, def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features, ProcessorFeatures.ZN3Tuning>; -def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA], +def : Proc<"geode", [FeatureX87, FeatureCX8, Feature3DNowA], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"winchip-c6", [FeatureX87, FeatureMMX], @@ -1491,7 +1556,7 @@ def : Proc<"winchip2", [FeatureX87, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"c3", [FeatureX87, Feature3DNow], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, +def : Proc<"c3-2", [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE1, FeatureFXSR, FeatureCMOV], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index d48b8e458219..c205395aa084 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -60,8 +61,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { SMShadowTracker.startFunction(MF); CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(), - MF.getContext())); + *Subtarget->getInstrInfo(), MF.getContext())); EmitFPOData = Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag(); @@ -70,12 +70,12 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (Subtarget->isTargetCOFF()) { bool Local = MF.getFunction().hasLocalLinkage(); - OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer->EmitCOFFSymbolStorageClass( + OutStreamer->beginCOFFSymbolDef(CurrentFnSym); + OutStreamer->emitCOFFSymbolStorageClass( Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION - << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer->endCOFFSymbolDef(); } // Emit the rest of the function body. 
@@ -249,7 +249,7 @@ void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo, void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { const MachineOperand &MO = MI->getOperand(OpNo); - if (!Modifier || MO.getType() != MachineOperand::MO_Register) + if (!Modifier || !MO.isReg()) return PrintOperand(MI, OpNo, O); if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) O << '%'; @@ -336,6 +336,37 @@ void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo, } } +static bool isSimpleReturn(const MachineInstr &MI) { + // We exclude all tail calls here which set both isReturn and isCall. + return MI.getDesc().isReturn() && !MI.getDesc().isCall(); +} + +static bool isIndirectBranchOrTailCall(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return MI.getDesc().isIndirectBranch() /*Make below code in a good shape*/ || + Opc == X86::TAILJMPr || Opc == X86::TAILJMPm || + Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 || + Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi || + Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 || + Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX; +} + +void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) { + if (Subtarget->hardenSlsRet() || Subtarget->hardenSlsIJmp()) { + auto I = MBB.getLastNonDebugInstr(); + if (I != MBB.end()) { + if ((Subtarget->hardenSlsRet() && isSimpleReturn(*I)) || + (Subtarget->hardenSlsIJmp() && isIndirectBranchOrTailCall(*I))) { + MCInst TmpInst; + TmpInst.setOpcode(X86::INT3); + EmitToStreamer(*OutStreamer, TmpInst); + } + } + } + AsmPrinter::emitBasicBlockEnd(MBB); + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); +} + void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { assert(isMem(*MI, OpNo) && "Invalid memory reference!"); @@ -363,6 +394,12 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, BaseReg.getReg() == X86::RIP) HasBaseReg = false; + // If we really just want to print out displacement. + if (Modifier && (DispSpec.isGlobal() || DispSpec.isSymbol()) && + !strcmp(Modifier, "disp-only")) { + HasBaseReg = false; + } + // If this has a segment register, print it. if (SegReg.getReg()) { PrintOperand(MI, OpNo + X86::AddrSegmentReg, O); @@ -606,11 +643,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, PrintMemReference(MI, OpNo, O, "H"); } return false; - case 'P': // Don't print @PLT, but do print as memory. + // Print memory only with displacement. The Modifer 'P' is used in inline + // asm to present a call symbol or a global symbol which can not use base + // reg or index reg. + case 'P': if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { - PrintIntelMemReference(MI, OpNo, O, "no-rip"); + PrintIntelMemReference(MI, OpNo, O, "disp-only"); } else { - PrintMemReference(MI, OpNo, O, "no-rip"); + PrintMemReference(MI, OpNo, O, "disp-only"); } return false; } @@ -641,7 +681,7 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) { MCSection *Cur = OutStreamer->getCurrentSectionOnly(); MCSection *Nt = MMI->getContext().getELFSection( ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC); - OutStreamer->SwitchSection(Nt); + OutStreamer->switchSection(Nt); // Emitting note header. const int WordSize = TT.isArch64Bit() && !TT.isX32() ? 8 : 4; @@ -658,21 +698,21 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) { emitAlignment(WordSize == 4 ? 
Align(4) : Align(8)); // padding OutStreamer->endSection(Nt); - OutStreamer->SwitchSection(Cur); + OutStreamer->switchSection(Cur); } } if (TT.isOSBinFormatMachO()) - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); if (TT.isOSBinFormatCOFF()) { // Emit an absolute @feat.00 symbol. This appears to be some kind of // compiler features bitfield read by link.exe. MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); - OutStreamer->BeginCOFFSymbolDef(S); - OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(S); + OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->endCOFFSymbolDef(); int64_t Feat00Flags = 0; if (TT.getArch() == Triple::x86) { @@ -739,7 +779,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { // Output stubs for external and common global variables. Stubs = MMIMacho.GetGVStubList(); if (!Stubs.empty()) { - OutStreamer.SwitchSection(MMI->getContext().getMachOSection( + OutStreamer.switchSection(MMI->getContext().getMachOSection( "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, SectionKind::getMetadata())); @@ -747,7 +787,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); Stubs.clear(); - OutStreamer.AddBlankLine(); + OutStreamer.addBlankLine(); } } @@ -795,6 +835,22 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) { emitStackMaps(SM); FM.serializeToFaultMapSection(); } + + // Emit __morestack address if needed for indirect calls. 
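  // Segmented-stack prologues in the 64-bit large code model cannot assume
  // __morestack is within rel32 range, so they call it through an indirect
  // pointer; when such a prologue has created the __morestack_addr symbol,
  // materialize it in read-only data, roughly:
  //     .section .rodata
  //   __morestack_addr:
  //     .quad __morestack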
+ if (TT.getArch() == Triple::x86_64 && TM.getCodeModel() == CodeModel::Large) { + if (MCSymbol *AddrSymbol = OutContext.lookupSymbol("__morestack_addr")) { + Align Alignment(1); + MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant( + getDataLayout(), SectionKind::getReadOnly(), + /*C=*/nullptr, Alignment); + OutStreamer->switchSection(ReadOnlySection); + OutStreamer->emitLabel(AddrSymbol); + + unsigned PtrSize = MAI->getCodePointerSize(); + OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("__morestack"), + PtrSize); + } + } } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index 94679e6e3d11..d53c26b729ef 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -131,10 +131,7 @@ public: void emitInstruction(const MachineInstr *MI) override; - void emitBasicBlockEnd(const MachineBasicBlock &MBB) override { - AsmPrinter::emitBasicBlockEnd(MBB); - SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); - } + void emitBasicBlockEnd(const MachineBasicBlock &MBB) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override; diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index 0899783d5f60..2ecf49382d29 100644 --- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -35,6 +35,7 @@ #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #define AVOIDCALL_DESC "X86 avoid trailing call pass" @@ -69,8 +70,8 @@ INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. This logic conservatively assumes // they might expand to nothing. -static bool isRealInstruction(MachineInstr &MI) { - return !MI.isPseudo() && !MI.isMetaInstruction(); +static bool isCallOrRealInstruction(MachineInstr &MI) { + return MI.isCall() || (!MI.isPseudo() && !MI.isMetaInstruction()); } // Return true if this is a call instruction, but not a tail call. @@ -100,7 +101,7 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { continue; // Find the last real instruction in this block. - auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction); + auto LastRealInstr = llvm::find_if(reverse(MBB), isCallOrRealInstruction); // If the block is empty or the last real instruction is a call instruction, // insert an int3. If there is a call instruction, insert the int3 between diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index c80a5d5bb332..ded93fdc011c 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -299,7 +299,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { const MachineFunction &MF = State.getMachineFunction(); size_t ArgCount = State.getMachineFunction().getFunction().arg_size(); - bool Is64Bit = static_cast(MF.getSubtarget()).is64Bit(); + bool Is64Bit = MF.getSubtarget().is64Bit(); unsigned SlotSize = Is64Bit ? 
8 : 4; unsigned Offset; if (ArgCount == 1 && ValNo == 0) { diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp index 96d3d1390a59..f32891552a82 100644 --- a/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -97,6 +97,11 @@ static cl::opt ForceMemOperand( cl::desc("Convert cmovs to branches whenever they have memory operands."), cl::init(true), cl::Hidden); +static cl::opt ForceAll( + "x86-cmov-converter-force-all", + cl::desc("Convert all cmovs to branches."), + cl::init(false), cl::Hidden); + namespace { /// Converts X86 cmov instructions into branches when profitable. @@ -174,11 +179,11 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { TSchedModel.init(&STI); // Before we handle the more subtle cases of register-register CMOVs inside - // of potentially hot loops, we want to quickly remove all CMOVs with - // a memory operand. The CMOV will risk a stall waiting for the load to - // complete that speculative execution behind a branch is better suited to - // handle on modern x86 chips. - if (ForceMemOperand) { + // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or + // the ones with a memory operand (ForceMemOperand option). The latter CMOV + // will risk a stall waiting for the load to complete that speculative + // execution behind a branch is better suited to handle on modern x86 chips. + if (ForceMemOperand || ForceAll) { CmovGroups AllCmovGroups; SmallVector Blocks; for (auto &MBB : MF) @@ -186,7 +191,8 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) { for (auto &Group : AllCmovGroups) { // Skip any group that doesn't do at least one memory operand cmov. - if (llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) + if (ForceMemOperand && !ForceAll && + llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) continue; // For CMOV groups which we can rewrite and which contain a memory load, @@ -196,12 +202,15 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { convertCmovInstsToBranches(Group); } } + // Early return as ForceAll converts all CmovGroups. + if (ForceAll) + return Changed; } //===--------------------------------------------------------------------===// // Register-operand Conversion Algorithm // --------- - // For each inner most loop + // For each innermost loop // collectCmovCandidates() { // Find all CMOV-group-candidates. // } @@ -230,7 +239,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { Loops.push_back(Child); for (MachineLoop *CurrLoop : Loops) { - // Optimize only inner most loops. + // Optimize only innermost loops. if (!CurrLoop->getSubLoops().empty()) continue; @@ -520,7 +529,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( //===--------------------------------------------------------------------===// // Step 3: Check for each CMOV-group-candidate if it worth to be optimized. // Worth-Optimize-Group: - // Iff it worths to optimize all CMOV instructions in the group. + // Iff it is worth to optimize all CMOV instructions in the group. 
// // Worth-Optimize-CMOV: // Predicted branch is faster than CMOV by the difference between depth of diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp index 2ff8ee19561b..29668f4b2761 100644 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -16,6 +16,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/ProfileData/SampleProf.h" @@ -159,7 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { } // Since we were able to encode, bump the MemOpDiscriminators. ++MemOpDiscriminators[L]; - DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue()); + DI = DI->cloneWithDiscriminator(*EncodedDiscriminator); assert(DI && "DI should not be nullptr"); updateDebugInfo(&MI, DI); Changed = true; diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 9826bf4bf861..9d4338deca35 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -15,6 +15,7 @@ #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/STLExtras.h" @@ -86,7 +87,7 @@ protected: public: InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {} - virtual ~InstrConverterBase() {} + virtual ~InstrConverterBase() = default; /// \returns true if \p MI is legal to convert. virtual bool isLegal(const MachineInstr *MI, @@ -374,7 +375,7 @@ class X86DomainReassignment : public MachineFunctionPass { const X86InstrInfo *TII = nullptr; /// All edges that are included in some closure - DenseSet EnclosedEdges; + BitVector EnclosedEdges{8, false}; /// All instructions that are included in some closure. DenseMap EnclosedInstrs; @@ -429,10 +430,10 @@ char X86DomainReassignment::ID = 0; void X86DomainReassignment::visitRegister(Closure &C, Register Reg, RegDomain &Domain, SmallVectorImpl &Worklist) { - if (EnclosedEdges.count(Reg)) + if (!Reg.isVirtual()) return; - if (!Reg.isVirtual()) + if (EnclosedEdges.test(Register::virtReg2Index(Reg))) return; if (!MRI->hasOneDef(Reg)) @@ -550,7 +551,7 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { // Register already in this closure. if (!C.insertEdge(CurReg)) continue; - EnclosedEdges.insert(Reg); + EnclosedEdges.set(Register::virtReg2Index(Reg)); MachineInstr *DefMI = MRI->getVRegDef(CurReg); encloseInstr(C, DefMI); @@ -742,6 +743,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; EnclosedEdges.clear(); + EnclosedEdges.resize(MRI->getNumVirtRegs()); EnclosedInstrs.clear(); std::vector Closures; @@ -756,7 +758,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // Register already in closure. - if (EnclosedEdges.count(Reg)) + if (EnclosedEdges.test(Idx)) continue; // Calculate closure starting with Reg. 
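For context on the X86DomainReassignment change above: virtual registers are dense indices, so the set of enclosed edges can be tracked as one bit per register index instead of a hashed set. A minimal standalone sketch of that representation (plain C++ with invented names, not the LLVM data structure itself):

#include <vector>

// One bit per virtual-register index; sized once from the number of vregs,
// mirroring the EnclosedEdges.resize(MRI->getNumVirtRegs()) call above.
struct VirtRegSet {
  std::vector<bool> Bits;
  explicit VirtRegSet(unsigned NumVirtRegs) : Bits(NumVirtRegs, false) {}
  void insert(unsigned VRegIdx) { Bits[VRegIdx] = true; }
  bool contains(unsigned VRegIdx) const { return Bits[VRegIdx]; }
};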
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 6a047838f0b5..aebeec5a6d27 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,6 +19,7 @@ #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. @@ -552,7 +553,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::PTILELOADDV: case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) - MI.RemoveOperand(i); + MI.removeOperand(i); unsigned Opc = Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1; MI.setDesc(TII->get(Opc)); @@ -565,7 +566,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::PTDPBF16PSV: { MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) - MI.RemoveOperand(i); + MI.removeOperand(i); unsigned Opc; switch (Opcode) { case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; @@ -581,13 +582,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case X86::PTILESTOREDV: { for (int i = 1; i >= 0; --i) - MI.RemoveOperand(i); + MI.removeOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { for (int i = 2; i > 0; --i) // Remove row, col - MI.RemoveOperand(i); + MI.removeOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; } @@ -729,7 +730,7 @@ bool X86ExpandPseudo::ExpandPseudosWhichAffectControlFlow(MachineFunction &MF) { } bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - STI = &static_cast(MF.getSubtarget()); + STI = &MF.getSubtarget(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); X86FI = MF.getInfo(); diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 1ac998b7ff7e..f2c362eeaa48 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -49,22 +49,11 @@ class X86FastISel final : public FastISel { /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 - /// floating point ops. - /// When SSE is available, use it for f32 operations. - /// When SSE2 is available, use it for f64 operations. - bool X86ScalarSSEf64; - bool X86ScalarSSEf32; - bool X86ScalarSSEf16; - public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo) { Subtarget = &funcInfo.MF->getSubtarget(); - X86ScalarSSEf64 = Subtarget->hasSSE2(); - X86ScalarSSEf32 = Subtarget->hasSSE1(); - X86ScalarSSEf16 = Subtarget->hasFP16(); } bool fastSelectInstruction(const Instruction *I) override; @@ -158,9 +147,8 @@ private: /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. 
bool isScalarFPTypeInSSEReg(EVT VT) const { - return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 - (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 + return (VT == MVT::f64 && Subtarget->hasSSE2()) || + (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16; } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); @@ -292,6 +280,11 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, if (I->isTerminator() && llvm::any_of(successors(I), HasPhis)) return false; + // Make sure there are no potentially eflags clobbering constant + // materializations in between. + if (llvm::any_of(I->operands(), [](Value *V) { return isa(V); })) + return false; + CC = TmpCC; return true; } @@ -305,9 +298,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { VT = evt.getSimpleVT(); // For now, require SSE/SSE2 for performing floating-point operations, // since x87 requires additional work. - if (VT == MVT::f64 && !X86ScalarSSEf64) + if (VT == MVT::f64 && !Subtarget->hasSSE2()) return false; - if (VT == MVT::f32 && !X86ScalarSSEf32) + if (VT == MVT::f32 && !Subtarget->hasSSE1()) return false; // Similarly, no f80 support yet. if (VT == MVT::f80) @@ -325,6 +318,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment) { + bool HasSSE1 = Subtarget->hasSSE1(); + bool HasSSE2 = Subtarget->hasSSE2(); bool HasSSE41 = Subtarget->hasSSE41(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); @@ -354,20 +349,16 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM, Opc = X86::MOV64rm; break; case MVT::f32: - if (X86ScalarSSEf32) - Opc = HasAVX512 ? X86::VMOVSSZrm_alt : - HasAVX ? X86::VMOVSSrm_alt : - X86::MOVSSrm_alt; - else - Opc = X86::LD_Fp32m; + Opc = HasAVX512 ? X86::VMOVSSZrm_alt + : HasAVX ? X86::VMOVSSrm_alt + : HasSSE1 ? X86::MOVSSrm_alt + : X86::LD_Fp32m; break; case MVT::f64: - if (X86ScalarSSEf64) - Opc = HasAVX512 ? X86::VMOVSDZrm_alt : - HasAVX ? X86::VMOVSDrm_alt : - X86::MOVSDrm_alt; - else - Opc = X86::LD_Fp64m; + Opc = HasAVX512 ? X86::VMOVSDZrm_alt + : HasAVX ? X86::VMOVSDrm_alt + : HasSSE2 ? X86::MOVSDrm_alt + : X86::LD_Fp64m; break; case MVT::f80: // No f80 support yet. @@ -521,7 +512,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM, Opc = (IsNonTemporal && HasSSE2) ? 
X86::MOVNTI_64mr : X86::MOV64mr; break; case MVT::f32: - if (X86ScalarSSEf32) { + if (HasSSE1) { if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSS; else @@ -531,7 +522,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM, Opc = X86::ST_Fp32m; break; case MVT::f64: - if (X86ScalarSSEf32) { + if (HasSSE2) { if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSD; else @@ -1362,8 +1353,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { bool HasAVX512 = Subtarget->hasAVX512(); bool HasAVX = Subtarget->hasAVX(); - bool X86ScalarSSEf32 = Subtarget->hasSSE1(); - bool X86ScalarSSEf64 = Subtarget->hasSSE2(); + bool HasSSE1 = Subtarget->hasSSE1(); + bool HasSSE2 = Subtarget->hasSSE2(); switch (VT.getSimpleVT().SimpleTy) { default: return 0; @@ -1372,15 +1363,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { case MVT::i32: return X86::CMP32rr; case MVT::i64: return X86::CMP64rr; case MVT::f32: - return X86ScalarSSEf32 - ? (HasAVX512 ? X86::VUCOMISSZrr - : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) - : 0; + return HasAVX512 ? X86::VUCOMISSZrr + : HasAVX ? X86::VUCOMISSrr + : HasSSE1 ? X86::UCOMISSrr + : 0; case MVT::f64: - return X86ScalarSSEf64 - ? (HasAVX512 ? X86::VUCOMISDZrr - : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) - : 0; + return HasAVX512 ? X86::VUCOMISDZrr + : HasAVX ? X86::VUCOMISDrr + : HasSSE2 ? X86::UCOMISDrr + : 0; } } @@ -2036,7 +2027,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { /// the select. bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // Check if the subtarget supports these instructions. - if (!Subtarget->hasCMov()) + if (!Subtarget->canUseCMOV()) return false; // FIXME: Add support for i8. @@ -2289,12 +2280,13 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { default: return false; case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; - case MVT::f16: Opc = X86::CMOV_FR16X; break; case MVT::i32: Opc = X86::CMOV_GR32; break; - case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X - : X86::CMOV_FR32; break; - case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X - : X86::CMOV_FR64; break; + case MVT::f16: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break; + case MVT::f32: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break; + case MVT::f64: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); @@ -2495,7 +2487,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, } bool X86FastISel::X86SelectFPExt(const Instruction *I) { - if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && + if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() && I->getOperand(0)->getType()->isFloatTy()) { bool HasAVX512 = Subtarget->hasAVX512(); // fpext from float to double. @@ -2509,7 +2501,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { } bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { - if (X86ScalarSSEf64 && I->getType()->isFloatTy() && + if (Subtarget->hasSSE2() && I->getType()->isFloatTy() && I->getOperand(0)->getType()->isDoubleTy()) { bool HasAVX512 = Subtarget->hasAVX512(); // fptrunc from double to float. @@ -3733,25 +3725,23 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { // Get opcode and regclass of the output for the given load instruction. 
unsigned Opc = 0; + bool HasSSE1 = Subtarget->hasSSE1(); + bool HasSSE2 = Subtarget->hasSSE2(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX512 = Subtarget->hasAVX512(); switch (VT.SimpleTy) { default: return 0; case MVT::f32: - if (X86ScalarSSEf32) - Opc = HasAVX512 ? X86::VMOVSSZrm_alt : - HasAVX ? X86::VMOVSSrm_alt : - X86::MOVSSrm_alt; - else - Opc = X86::LD_Fp32m; + Opc = HasAVX512 ? X86::VMOVSSZrm_alt + : HasAVX ? X86::VMOVSSrm_alt + : HasSSE1 ? X86::MOVSSrm_alt + : X86::LD_Fp32m; break; case MVT::f64: - if (X86ScalarSSEf64) - Opc = HasAVX512 ? X86::VMOVSDZrm_alt : - HasAVX ? X86::VMOVSDrm_alt : - X86::MOVSDrm_alt; - else - Opc = X86::LD_Fp64m; + Opc = HasAVX512 ? X86::VMOVSDZrm_alt + : HasAVX ? X86::VMOVSDrm_alt + : HasSSE2 ? X86::MOVSDrm_alt + : X86::LD_Fp64m; break; case MVT::f80: // No f80 support yet. @@ -3852,11 +3842,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { default: break; case MVT::f32: - if (!X86ScalarSSEf32) + if (!Subtarget->hasSSE1()) Opc = X86::LD_Fp032; break; case MVT::f64: - if (!X86ScalarSSEf64) + if (!Subtarget->hasSSE2()) Opc = X86::LD_Fp064; break; case MVT::f80: @@ -3907,21 +3897,24 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { return 0; // Get opcode and regclass for the given zero. + bool HasSSE1 = Subtarget->hasSSE1(); + bool HasSSE2 = Subtarget->hasSSE2(); bool HasAVX512 = Subtarget->hasAVX512(); unsigned Opc = 0; switch (VT.SimpleTy) { default: return 0; + case MVT::f16: + Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH; + break; case MVT::f32: - if (X86ScalarSSEf32) - Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS; - else - Opc = X86::LD_Fp032; + Opc = HasAVX512 ? X86::AVX512_FsFLD0SS + : HasSSE1 ? X86::FsFLD0SS + : X86::LD_Fp032; break; case MVT::f64: - if (X86ScalarSSEf64) - Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD; - else - Opc = X86::LD_Fp064; + Opc = HasAVX512 ? X86::AVX512_FsFLD0SD + : HasSSE2 ? X86::FsFLD0SD + : X86::LD_Fp064; break; case MVT::f80: // No f80 support yet. diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp new file mode 100644 index 000000000000..7e5540022cc8 --- /dev/null +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -0,0 +1,709 @@ +//===-- X86FastPreTileConfig.cpp - Fast Tile Register Configure------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to preconfig the shape of physical tile registers +/// It inserts ldtilecfg ahead of each group of tile registers. The algorithm +/// walk each instruction of basic block in reverse order. All the tile +/// registers that live out the basic block would be spilled and reloaded +/// before its user. It also check the depenedency of the shape to ensure +/// the shape is defined before ldtilecfg. 
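// Illustrative sketch (not from the imported file): the live-out rule stated
// above, reduced to a standalone model. A tile value defined in one basic
// block but used in another must get a spill after its definition and a
// reload in front of each out-of-block user.

#include <map>
#include <set>
#include <vector>

struct TileRef { int Block; int Reg; bool IsDef; };

// Returns the tile vregs that need a spill/reload pair: any vreg with a use
// in a different block than its definition.
std::set<int> tilesNeedingSpill(const std::vector<TileRef> &Refs) {
  std::map<int, int> DefBlock;
  for (const TileRef &R : Refs)
    if (R.IsDef)
      DefBlock[R.Reg] = R.Block;
  std::set<int> Spilled;
  for (const TileRef &R : Refs)
    if (!R.IsDef && DefBlock.count(R.Reg) && DefBlock[R.Reg] != R.Block)
      Spilled.insert(R.Reg);
  return Spilled;
}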
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "fastpretileconfig" + +STATISTIC(NumStores, "Number of stores added"); +STATISTIC(NumLoads, "Number of loads added"); + +namespace { + +class X86FastPreTileConfig : public MachineFunctionPass { + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + X86MachineFunctionInfo *X86FI = nullptr; + MachineFrameInfo *MFI = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineBasicBlock *MBB = nullptr; + int CfgSS = -1; + struct PHIInfo { + Register Row; + Register Col; + Register StackAddr; + }; + DenseMap VisitedPHIs; + + /// Maps virtual regs to the frame index where these values are spilled. + IndexedMap StackSlotForVirtReg; + + /// Has a bit set for tile virtual register for which it was determined + /// that it is alive across blocks. + BitVector MayLiveAcrossBlocks; + + int getStackSpaceFor(Register VirtReg); + void InitializeTileConfigStackSpace(); + bool mayLiveOut(Register VirtReg, MachineInstr *CfgMI); + void spill(MachineBasicBlock::iterator Before, Register VirtReg, bool Kill); + void reload(MachineBasicBlock::iterator UseMI, Register VirtReg, + MachineOperand *RowMO, MachineOperand *ColMO); + void canonicalizePHIs(MachineBasicBlock &MBB); + void convertPHI(MachineBasicBlock *MBB, MachineInstr &PHI); + void convertPHIs(MachineBasicBlock &MBB); + bool configBasicBlock(MachineBasicBlock &MBB); + +public: + X86FastPreTileConfig() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + + /// Return the pass name. + StringRef getPassName() const override { + return "Fast Tile Register Preconfigure"; + } + + /// Perform tile register configure. + bool runOnMachineFunction(MachineFunction &MFunc) override; + + static char ID; +}; + +} // end anonymous namespace + +char X86FastPreTileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) +INITIALIZE_PASS_END(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) + +static bool dominates(MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator A, + MachineBasicBlock::const_iterator B) { + auto MBBEnd = MBB.end(); + if (B == MBBEnd) + return true; + + MachineBasicBlock::const_iterator I = MBB.begin(); + for (; &*I != A && &*I != B; ++I) + ; + + return &*I == A; +} + +/// This allocates space for the specified virtual register to be held on the +/// stack. +int X86FastPreTileConfig::getStackSpaceFor(Register VirtReg) { + // Find the location Reg would belong... + int SS = StackSlotForVirtReg[VirtReg]; + // Already has space allocated? + if (SS != -1) + return SS; + + // Allocate a new stack object for this spill location... 
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + unsigned Size = TRI->getSpillSize(RC); + Align Alignment = TRI->getSpillAlign(RC); + int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment); + + // Assign the slot. + StackSlotForVirtReg[VirtReg] = FrameIdx; + return FrameIdx; +} + +/// Returns false if \p VirtReg is known to not live out of the current config. +/// If \p VirtReg live out of the current MBB, it must live out of the current +/// config +bool X86FastPreTileConfig::mayLiveOut(Register VirtReg, MachineInstr *CfgMI) { + if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) + return true; + + for (const MachineInstr &UseInst : MRI->use_nodbg_instructions(VirtReg)) { + if (UseInst.getParent() != MBB) { + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); + return true; + } + + // The use and def are in the same MBB. If the tile register is + // reconfigured, it is crobbered and we need to spill and reload + // tile register. + if (CfgMI) { + if (dominates(*MBB, *CfgMI, UseInst)) { + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); + return true; + } + } + } + + return false; +} + +void X86FastPreTileConfig::InitializeTileConfigStackSpace() { + MachineBasicBlock &MBB = MF->front(); + MachineInstr *MI = &*MBB.getFirstNonPHI(); + DebugLoc DL; + if (ST->hasAVX512()) { + Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), CfgSS) + .addReg(Zmm); + } else if (ST->hasAVX2()) { + Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS) + .addReg(Ymm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS, + 32) + .addReg(Ymm); + } else { + assert(ST->hasSSE2() && "AMX should assume SSE2 enabled"); + unsigned StoreOpc = ST->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 16) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 32) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 48) + .addReg(Xmm); + } + // Fill in the palette first. + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), CfgSS) + .addImm(1); +} + +/// Insert spill instruction for \p AssignedReg before \p Before. +/// TODO: Update DBG_VALUEs with \p VirtReg operands with the stack slot. +void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, + Register VirtReg, bool Kill) { + LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI) << " \n"); + int FI = getStackSpaceFor(VirtReg); + LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n'); + + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + // Don't need shape information for tile store, becasue it is adjacent to + // the tile def instruction. + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI); + ++NumStores; + + // TODO: update DBG_VALUEs +} + +/// Insert reload instruction for \p PhysReg before \p Before. 
+void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, + Register OrigReg, MachineOperand *RowMO, + MachineOperand *ColMO) { + int FI = getStackSpaceFor(OrigReg); + const TargetRegisterClass &RC = *MRI->getRegClass(OrigReg); + Register TileReg; + // Fold copy to tileload + // BB1: + // spill src to s + // + // BB2: + // t = copy src + // --> + // t = tileload (s) + if (UseMI->isCopy()) + TileReg = UseMI->getOperand(0).getReg(); + else + TileReg = MRI->createVirtualRegister(&RC); + // Can't use TII->loadRegFromStackSlot(), because we need the shape + // information for reload. + // tileloadd (%sp, %idx), %tmm + unsigned Opc = X86::PTILELOADDV; + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + // FIXME: MBB is not the parent of UseMI. + MachineInstr *NewMI = BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), + TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + NewMI = addFrameReference( + BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), TII->get(Opc), TileReg) + .addReg(RowMO->getReg()) + .addReg(ColMO->getReg()), + FI); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); + RowMO->setIsKill(false); + ColMO->setIsKill(false); + // Erase copy instruction after it is folded. + if (UseMI->isCopy()) { + UseMI->eraseFromParent(); + } else { + // Replace the register in the user MI. + for (auto &MO : UseMI->operands()) { + if (MO.isReg() && MO.getReg() == OrigReg) + MO.setReg(TileReg); + } + } + + ++NumLoads; + LLVM_DEBUG(dbgs() << "Reloading " << printReg(OrigReg, TRI) << " into " + << printReg(TileReg, TRI) << '\n'); +} + +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // The instruction must have 3 operands: tile def, row, col. + if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo()) + return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet. + if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } + + return false; +} + +static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { + MachineInstr *MI = MRI->getVRegDef(TileReg); + if (isTileDef(MRI, *MI)) { + MachineOperand *RowMO = &MI->getOperand(1); + MachineOperand *ColMO = &MI->getOperand(2); + return ShapeT(RowMO, ColMO, MRI); + } else if (MI->isCopy()) { + TileReg = MI->getOperand(1).getReg(); + return getShape(MRI, TileReg); + } + + // The def should not be PHI node, because we walk the MBB in reverse post + // order. + assert(MI->isPHI() && "Unexpected PHI when get shape."); + llvm_unreachable("Unexpected MI when get shape."); +} + +// BB0: +// spill t0 to s0 +// BB1: +// spill t1 to s1 +// +// BB2: +// t = phi [t0, bb0] [t1, bb1] +// --> +// row = phi [r0, bb0] [r1, bb1] +// col = phi [c0, bb0] [c1, bb1] +// s = phi [s0, bb0] [s1, bb1] +// t = tileload row, col, s +// The new instruction is inserted at the end of the phi node. The order +// of the original phi node is not ensured. +void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB, + MachineInstr &PHI) { + // 1. Create instruction to get stack slot address of each incoming block. + // 2. Create PHI node for the stack address. + // 3. Create PHI node for shape. If one of the incoming shape is immediate + // use the immediate and delete the PHI node. + // 4. Create tileload instruction from the stack address. 
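  // Clarifying note (added for this document, not from the imported source):
  // filling in the incoming operands below may recurse into the PHI nodes
  // that feed this one. The entry in VisitedPHIs is recorded before the
  // incoming operands are walked, so a circular reference such as
  //   t3 = phi(t1, t4)
  //   t4 = phi(t2, t3)
  // terminates: when the recursion reaches a PHI that is already being
  // converted, it reuses the row/col/stack-address registers recorded in
  // VisitedPHIs instead of converting it again.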
+ Register StackAddrReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + MachineInstrBuilder AddrPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), StackAddrReg); + Register RowReg = MRI->createVirtualRegister(&X86::GR16RegClass); + MachineInstrBuilder RowPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), RowReg); + Register ColReg = MRI->createVirtualRegister(&X86::GR16RegClass); + MachineInstrBuilder ColPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), ColReg); + // Record the mapping of phi node and its row/column information. + VisitedPHIs[&PHI] = {RowReg, ColReg, StackAddrReg}; + + for (unsigned I = 1, E = PHI.getNumOperands(); I != E; I += 2) { + // Get the 2 incoming value of tile register and MBB. + Register InTileReg = PHI.getOperand(I).getReg(); + // Mark it as liveout, so that it will be spilled when visit + // the incoming MBB. Otherwise since phi will be deleted, it + // would miss spill when visit incoming MBB. + MayLiveAcrossBlocks.set(Register::virtReg2Index(InTileReg)); + MachineBasicBlock *InMBB = PHI.getOperand(I + 1).getMBB(); + + MachineInstr *TileDefMI = MRI->getVRegDef(InTileReg); + MachineBasicBlock::iterator InsertPos; + if (TileDefMI->isPHI()) { + InsertPos = TileDefMI->getParent()->getFirstNonPHI(); + if (VisitedPHIs.count(TileDefMI)) { // circular phi reference + // def t1 + // / \ + // def t2 t3 = phi(t1, t4) <-- + // \ / | + // t4 = phi(t2, t3)------------- + // + // For each (row, column and stack address) append phi incoming value. + // Create r3 = phi(r1, r4) + // Create r4 = phi(r2, r3) + Register InRowReg = VisitedPHIs[TileDefMI].Row; + Register InColReg = VisitedPHIs[TileDefMI].Col; + Register InStackAddrReg = VisitedPHIs[TileDefMI].StackAddr; + RowPHI.addReg(InRowReg).addMBB(InMBB); + ColPHI.addReg(InColReg).addMBB(InMBB); + AddrPHI.addReg(InStackAddrReg).addMBB(InMBB); + continue; + } else { + // Recursively convert PHI to tileload + convertPHI(TileDefMI->getParent(), *TileDefMI); + // The PHI node is coverted to tileload instruction. Get the stack + // address from tileload operands. + MachineInstr *TileLoad = MRI->getVRegDef(InTileReg); + assert(TileLoad && TileLoad->getOpcode() == X86::PTILELOADDV); + Register InRowReg = TileLoad->getOperand(1).getReg(); + Register InColReg = TileLoad->getOperand(2).getReg(); + Register InStackAddrReg = TileLoad->getOperand(3).getReg(); + RowPHI.addReg(InRowReg).addMBB(InMBB); + ColPHI.addReg(InColReg).addMBB(InMBB); + AddrPHI.addReg(InStackAddrReg).addMBB(InMBB); + } + } else { + InsertPos = TileDefMI->getIterator(); + + // Fill the incoming operand of row/column phi instruction. + ShapeT Shape = getShape(MRI, InTileReg); + Shape.getRow()->setIsKill(false); + Shape.getCol()->setIsKill(false); + RowPHI.addReg(Shape.getRow()->getReg()).addMBB(InMBB); + ColPHI.addReg(Shape.getCol()->getReg()).addMBB(InMBB); + + // The incoming tile register live out of its def BB, it would be spilled. 
+ // Create MI to get the spill stack slot address for the tile register + int FI = getStackSpaceFor(InTileReg); + Register InStackAddrReg = + MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + addOffset(BuildMI(*TileDefMI->getParent(), InsertPos, DebugLoc(), + TII->get(X86::LEA64r), InStackAddrReg) + .addFrameIndex(FI), + 0); + AddrPHI.addReg(InStackAddrReg).addMBB(InMBB); + } + } + + MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI(); + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + Register TileReg = PHI.getOperand(0).getReg(); + MachineInstr *NewMI = addDirectMem( + BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::PTILELOADDV), TileReg) + .addReg(RowReg) + .addReg(ColReg), + StackAddrReg); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); + PHI.eraseFromParent(); + VisitedPHIs.erase(&PHI); +} + +static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + MachineOperand &MO = MI.getOperand(0); + if (MO.isReg() && MO.getReg().isVirtual() && + MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) + return true; + return false; +} + +void X86FastPreTileConfig::canonicalizePHIs(MachineBasicBlock &MBB) { + SmallVector PHIs; + + for (MachineInstr &MI : MBB) { + if (!MI.isPHI()) + break; + if (!isTileRegDef(MRI, MI)) + continue; + PHIs.push_back(&MI); + } + // Canonicalize the phi node first. One tile phi may depeneds previous + // phi node. For below case, we need convert %t4. + // + // BB0: + // %t3 = phi (t1 BB1, t2 BB0) + // %t4 = phi (t5 BB1, t3 BB0) + // --> + // %t3 = phi (t1 BB1, t2 BB0) + // %t4 = phi (t5 BB1, t2 BB0) + // + while (!PHIs.empty()) { + MachineInstr *PHI = PHIs.pop_back_val(); + + // Find the operand that is incoming from the same MBB and the def + // is also phi node. + MachineOperand *InMO = nullptr; + MachineInstr *DefMI = nullptr; + for (unsigned I = 1, E = PHI->getNumOperands(); I != E; I += 2) { + Register InTileReg = PHI->getOperand(I).getReg(); + MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB(); + DefMI = MRI->getVRegDef(InTileReg); + if (InMBB != &MBB || !DefMI->isPHI()) + continue; + + InMO = &PHI->getOperand(I); + break; + } + // If can't find such operand, do nothing. + if (!InMO) + continue; + + // Current phi node depends on previous phi node. Break the + // dependency. + Register DefTileReg; + for (unsigned I = 1, E = DefMI->getNumOperands(); I != E; I += 2) { + MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB(); + if (InMBB != &MBB) + continue; + DefTileReg = DefMI->getOperand(I).getReg(); + InMO->setReg(DefTileReg); + break; + } + } +} + +void X86FastPreTileConfig::convertPHIs(MachineBasicBlock &MBB) { + SmallVector PHIs; + for (MachineInstr &MI : MBB) { + if (!MI.isPHI()) + break; + if (!isTileRegDef(MRI, MI)) + continue; + PHIs.push_back(&MI); + } + while (!PHIs.empty()) { + MachineInstr *MI = PHIs.pop_back_val(); + VisitedPHIs.clear(); + convertPHI(&MBB, *MI); + } +} + +// PreTileConfig should configure the tile registers based on basic +// block. 
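// Illustrative aside (not part of the imported file): the stack slot created
// in configBasicBlock below is the 64-byte configuration block read by
// ldtilecfg; its byte layout is documented in the comment inside
// X86FastTileConfig.cpp later in this patch. A standalone sketch of filling
// one tile's entry in that layout:

#include <array>
#include <cstdint>
#include <cstring>

// Byte 0: palette (1).  Bytes 16..31: colsb, two bytes per tile register.
// Bytes 48..55: rows, one byte per tile register.  Reserved bytes stay zero.
std::array<uint8_t, 64> makeTileCfg(unsigned TMMIdx, uint8_t Rows,
                                    uint16_t ColsBytes) {
  std::array<uint8_t, 64> Cfg{};
  Cfg[0] = 1;
  std::memcpy(&Cfg[16 + TMMIdx * 2], &ColsBytes, sizeof(ColsBytes));
  Cfg[48 + TMMIdx] = Rows;
  return Cfg;
}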
+bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + this->MBB = &MBB; + bool Change = false; + MachineInstr *LastShapeMI = nullptr; + MachineInstr *LastTileCfg = nullptr; + bool HasUnconfigTile = false; + + auto Config = [&](MachineInstr &Before) { + if (CfgSS == -1) + CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(), + ST->getTileConfigAlignment(), false); + LastTileCfg = addFrameReference( + BuildMI(MBB, Before, DebugLoc(), TII->get(X86::PLDTILECFGV)), CfgSS); + LastShapeMI = nullptr; + Change = true; + }; + auto HasTileOperand = [](MachineRegisterInfo *MRI, MachineInstr &MI) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + } + return false; + }; + for (MachineInstr &MI : reverse(MBB)) { + // We have transformed phi node before configuring BB. + if (MI.isPHI()) + break; + // Don't collect the shape of used tile, the tile should be defined + // before the tile use. Spill and reload would happen if there is only + // tile use after ldtilecfg, so the shape can be collected from reload. + // Take below code for example. %t would be reloaded before tilestore + // call + // .... + // tilestore %r, %c, %t + // --> + // call + // ldtilecfg + // %t = tileload %r, %c + // tilestore %r, %c, %t + if (HasTileOperand(MRI, MI)) + HasUnconfigTile = true; + // According to AMX ABI, all the tile registers including config register + // are volatile. Caller need to save/restore config register. + if (MI.isCall() && HasUnconfigTile) { + MachineBasicBlock::iterator I; + if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) + I = ++LastShapeMI->getIterator(); + else + I = ++MI.getIterator(); + Config(*I); + HasUnconfigTile = false; + continue; + } + if (!isTileDef(MRI, MI)) + continue; + // + //--------------------------------------------------------------------- + // Don't handle COPY instruction. If the src and dst of the COPY can be + // in the same config in below case, we just check the shape of t0. + // def row0 + // def col0 + // ldtilecfg + // t0 = tielzero(row0, col0) + // t1 = copy t0 + // ... + // If the src and dst of the COPY can NOT be in the same config in below + // case. Reload would be generated befor the copy instruction. + // def row0 + // def col0 + // t0 = tielzero(row0, col0) + // spill t0 + // ... + // def row1 + // def col1 + // ldtilecfg + // t1 = tilezero(row1, col1) + // reload t0 + // t1 = copy t0 + //--------------------------------------------------------------------- + // + // If MI dominate the last shape def instruction, we need insert + // ldtilecfg after LastShapeMI now. The config doesn't include + // current MI. + // def row0 + // def col0 + // tilezero(row0, col0) <- MI + // def row1 + // def col1 + // ldtilecfg <- insert + // tilezero(row1, col1) + if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) + Config(*(++LastShapeMI->getIterator())); + MachineOperand *RowMO = &MI.getOperand(1); + MachineOperand *ColMO = &MI.getOperand(2); + MachineInstr *RowMI = MRI->getVRegDef(RowMO->getReg()); + MachineInstr *ColMI = MRI->getVRegDef(ColMO->getReg()); + // If the shape is defined in current MBB, check the domination. + // FIXME how about loop? 
+ if (RowMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = RowMI; + else if (dominates(MBB, LastShapeMI, RowMI)) + LastShapeMI = RowMI; + } + if (ColMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = ColMI; + else if (dominates(MBB, LastShapeMI, ColMI)) + LastShapeMI = ColMI; + } + // If there is user live out of the tilecfg, spill it and reload in + // before the user. + Register TileReg = MI.getOperand(0).getReg(); + if (mayLiveOut(TileReg, LastTileCfg)) + spill(++MI.getIterator(), TileReg, false); + for (MachineInstr &UseMI : MRI->use_instructions(TileReg)) { + if (UseMI.getParent() == &MBB) { + // check user should not across ldtilecfg + if (!LastTileCfg || !dominates(MBB, LastTileCfg, UseMI)) + continue; + // reload befor UseMI + reload(UseMI.getIterator(), TileReg, RowMO, ColMO); + } else { + // Don't reload for phi instruction, we handle phi reload separately. + // TODO: merge the reload for the same user MBB. + if (!UseMI.isPHI()) + reload(UseMI.getIterator(), TileReg, RowMO, ColMO); + } + } + } + + // Configure tile registers at the head of the MBB + if (HasUnconfigTile) { + MachineInstr *Before; + if (LastShapeMI == nullptr || LastShapeMI->isPHI()) + Before = &*MBB.getFirstNonPHI(); + else + Before = &*(++LastShapeMI->getIterator()); + + Config(*Before); + } + + return Change; +} + +bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) { + MF = &MFunc; + MRI = &MFunc.getRegInfo(); + ST = &MFunc.getSubtarget(); + TII = ST->getInstrInfo(); + X86FI = MFunc.getInfo(); + MFI = &MFunc.getFrameInfo(); + TRI = ST->getRegisterInfo(); + CfgSS = -1; + + unsigned NumVirtRegs = MRI->getNumVirtRegs(); + // Abandon early if there is no tile register to config. + bool HasVirtTileReg = false; + for (unsigned I = 0, E = NumVirtRegs; I != E; ++I) { + Register VirtReg = Register::index2VirtReg(I); + if (MRI->getRegClass(VirtReg)->getID() == X86::TILERegClassID) { + HasVirtTileReg = true; + break; + } + } + if (!HasVirtTileReg) + return false; + + StackSlotForVirtReg.resize(NumVirtRegs); + MayLiveAcrossBlocks.clear(); + // We will create register during config. *3 is to make sure + // the virtual register number doesn't exceed the size of + // the bit vector. + MayLiveAcrossBlocks.resize(NumVirtRegs * 3); + bool Change = false; + assert(MRI->isSSA()); + + // Canonicalize the phi node first. + for (MachineBasicBlock &MBB : MFunc) + canonicalizePHIs(MBB); + + // Loop over all of the basic blocks in reverse post order and insert + // ldtilecfg for tile registers. The reserse post order is to facilitate + // PHI node convert. 
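// Illustrative aside (not part of the imported file): reverse post-order is
// a DFS post-order, reversed. In an acyclic CFG every block is then visited
// after all of its predecessors, which is why this walk sees tile and shape
// definitions before the PHIs and uses that consume them. A standalone
// sketch over a successor-list graph:

#include <vector>

void postOrder(unsigned N, const std::vector<std::vector<unsigned>> &Succ,
               std::vector<bool> &Seen, std::vector<unsigned> &Out) {
  Seen[N] = true;
  for (unsigned S : Succ[N])
    if (!Seen[S])
      postOrder(S, Succ, Seen, Out);
  Out.push_back(N);
}

std::vector<unsigned> reversePostOrder(
    const std::vector<std::vector<unsigned>> &Succ) {
  std::vector<bool> Seen(Succ.size(), false);
  std::vector<unsigned> Out;
  postOrder(0, Succ, Seen, Out);            // block 0 is assumed to be entry
  return {Out.rbegin(), Out.rend()};
}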
+ ReversePostOrderTraversal RPOT(MF); + for (MachineBasicBlock *MBB : RPOT) { + convertPHIs(*MBB); + Change |= configBasicBlock(*MBB); + } + + if (Change) + InitializeTileConfigStackSpace(); + + StackSlotForVirtReg.clear(); + return Change; +} + +FunctionPass *llvm::createX86FastPreTileConfigPass() { + return new X86FastPreTileConfig(); +} diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 061fff50bcea..2a20cd13791d 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -40,40 +40,25 @@ namespace { class X86FastTileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI = nullptr; const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; X86MachineFunctionInfo *X86FI = nullptr; - MachineInstr *getTileConfigPoint(); - void tileConfig(); + bool configBasicBlock(MachineBasicBlock &MBB); public: X86FastTileConfig() : MachineFunctionPass(ID) {} - bool fastTileConfig(); - bool isTileLoad(MachineInstr &MI); - bool isTileStore(MachineInstr &MI); - bool isAMXInstr(MachineInstr &MI); - - MachineInstr *getKeyAMXInstr(MachineInstr *MI); - void getTileShapesCfg(MachineInstr *MI, - SmallVector &ShapedTiles); - void getShapeCfgInstrs(MachineInstr *MI, - std::map &RowCfgs, - std::map &ColCfgs); - /// Return the pass name. StringRef getPassName() const override { return "Fast Tile Register Configure"; } - void materializeTileCfg(MachineInstr *MI); - - void rewriteTileCfg(SmallVector &ShapedTiles, - std::map &RowCfgs, - std::map &ColCfgs); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } /// Perform register allocation. bool runOnMachineFunction(MachineFunction &MFunc) override; @@ -95,209 +80,107 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static bool isTilePhysReg(MachineOperand &Op) { - if (!Op.isReg()) +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // There is no phi instruction after register allocation. + assert(MI.isPHI() == false); + // The instruction must have 3 operands: tile def, row, col. + // It should be AMX pseudo instruction that have shape operand. + if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || + !MI.isPseudo()) return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet. 
+ if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } - Register Reg = Op.getReg(); - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; return false; } -static unsigned getTilePhysRegIdx(MachineOperand *Op) { - assert(isTilePhysReg(*Op) && "Tile Operand is invalid"); - return Op->getReg() - X86::TMM0; -} - -static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 48 + TIdx; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 16 + TIdx * 2; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILELOADDV || - MI.getOpcode() == X86::PTILELOADDT1V; -} -bool X86FastTileConfig::isTileStore(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILESTOREDV; -} -bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { - // TODO: May need to handle some special nontile amx instrucion. - if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr()) - return false; - - return llvm::any_of(MI.operands(), isTilePhysReg); -} - -MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *KeyMI = nullptr; - int KeyAMXNum = 0; - - for (auto II = Cfg; II != MBB->end(); II++) { - if (isTileLoad(*II)) { - KeyMI = &*II; +// PreTileConfig should configure the tile registers based on basic +// block. +bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + bool Change = false; + SmallVector, 6> ShapeInfos; + for (MachineInstr &MI : reverse(MBB)) { + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) continue; + // AMX instructions that define tile register. + if (MI.getOpcode() != X86::PLDTILECFGV) { + MachineOperand &Row = MI.getOperand(1); + MachineOperand &Col = MI.getOperand(2); + unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0; + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); + } else { // PLDTILECFGV + // Rewrite the shape information to memory. Stack slot should have + // been initialized to zero in pre config. + int SS = MI.getOperand(0).getIndex(); // tile config stack slot. + for (auto &ShapeInfo : ShapeInfos) { + DebugLoc DL; + unsigned TMMIdx = ShapeInfo.first; + Register RowReg = ShapeInfo.second.getRow()->getReg(); + Register ColReg = ShapeInfo.second.getCol()->getReg(); + // Here is the data format for the tile config. + // 0 palette + // 1 start_row + // 2-15 reserved, must be zero + // 16-17 tile0.colsb Tile 0 bytes per row. + // 18-19 tile1.colsb Tile 1 bytes per row. + // 20-21 tile2.colsb Tile 2 bytes per row. + // ... (sequence continues) + // 30-31 tile7.colsb Tile 7 bytes per row. + // 32-47 reserved, must be zero + // 48 tile0.rows Tile 0 rows. + // 49 tile1.rows Tile 1 rows. + // 50 tile2.rows Tile 2 rows. + // ... (sequence continues) + // 55 tile7.rows Tile 7 rows. 
+ // 56-63 reserved, must be zero + int RowOffset = 48 + TMMIdx; + int ColOffset = 16 + TMMIdx * 2; + + Register SubRowReg = TRI->getSubReg(RowReg, X86::sub_8bit); + BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), SubRowReg); + MachineInstrBuilder StoreRow = + BuildMI(MBB, MI, DL, TII->get(X86::MOV8mr)); + addFrameReference(StoreRow, SS, RowOffset).addReg(SubRowReg); + + MachineInstrBuilder StoreCol = + BuildMI(MBB, MI, DL, TII->get(X86::MOV16mr)); + addFrameReference(StoreCol, SS, ColOffset).addReg(ColReg); + } + ShapeInfos.clear(); + Change = true; } - - if (isTileStore(*II)) { - assert(KeyMI && "Key AMX Should be found before!"); - break; - } - - if (isAMXInstr(*II)) { - assert((KeyAMXNum == 0) && "Too many Key AMX instruction!"); - KeyAMXNum++; - KeyMI = &*II; - } - } - assert(KeyMI && "There must be an AMX instruction."); - return KeyMI; -} - -// Orderly get the tiles in key amx instruction, uses before defs. -void X86FastTileConfig::getTileShapesCfg( - MachineInstr *CfgMI, SmallVector &ShapedTiles) { - MachineInstr *KeyMI = getKeyAMXInstr(CfgMI); - - SmallVector DefTiles; - for (MachineOperand &MO : KeyMI->operands()) { - if (!isTilePhysReg(MO)) - continue; - if (MO.isDef()) - DefTiles.push_back(&MO); - else - ShapedTiles.push_back(&MO); - } - ShapedTiles.append(DefTiles); -} - -// We pre-config the shapes at position named with "amx.tmm.N.shape.row* and -// amx.shape.N.col*" at pass "Pre AMX Tile Config". -// The 'N' implies the order of tiles in key amx intrinsic. -void X86FastTileConfig::getShapeCfgInstrs( - MachineInstr *MI, std::map &RowCfgs, - std::map &ColCfgs) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - - for (auto II = Cfg; II != MBB->begin(); II--) { - if (isAMXInstr(*II) || II->isTerminator() || II->isCall()) - break; - if (!II->mayStore() || !II->hasOneMemOperand()) - continue; - const Value *MemPtr = II->memoperands()[0]->getValue(); - if (!MemPtr) - continue; - - StringRef Name = MemPtr->getName(); - if (!Name.startswith("amx.tmm.")) - continue; - - // Get the 'N'th tile shape config in key amx instruction. - auto N = Name.find(".shape"); - StringRef STileIdx = Name.slice(8, N); - unsigned Idx; - STileIdx.getAsInteger(10, Idx); - - // And related them with their store instructions. - if (Name.contains("row")) - RowCfgs[Idx] = &*II; - else if (Name.contains("col")) - ColCfgs[Idx] = &*II; - else - llvm_unreachable("Invalid tile shape info!"); } - assert((RowCfgs.size() == ColCfgs.size()) && - "The number of tile row and col must be equal!"); -} - -// Here is the data format for the tile config. -// 0 palette = 1 now. -// 1 start_row = 0 now. -// 2-15 reserved, must be zero -// 16-17 tile0.colsb Tile 0 bytes per row. -// 18-19 tile1.colsb Tile 1 bytes per row. -// 20-21 tile2.colsb Tile 2 bytes per row. -// ... (sequence continues) -// 30-31 tile7.colsb Tile 7 bytes per row. -// 32-47 reserved, must be zero -// 48 tile0.rows Tile 0 rows. -// 49 tile1.rows Tile 1 rows. -// 50 tile2.rows Tile 2 rows. -// ... (sequence continues) -// 55 tile7.rows Tile 7 rows. -// 56-63 reserved, must be zero -void X86FastTileConfig::rewriteTileCfg( - SmallVector &ShapedTiles, - std::map &RowCfgs, - std::map &ColCfgs) { - assert((RowCfgs.size() == ShapedTiles.size()) && - "The number of tile shapes not equal with the number of tiles!"); - // Orderly get the tiles and adjust the shape config. 
- for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) { - MachineOperand *MO = ShapedTiles[I]; - unsigned TmmIdx = getTilePhysRegIdx(MO); - if (I == TmmIdx) - continue; - adjustRowCfg(TmmIdx, RowCfgs[I]); - adjustColCfg(TmmIdx, ColCfgs[I]); - } -} - -// We have already preconfig the shapes before fast register allocation at -// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register -// allocation, the shapes pre-written before may not rightly corresponding -// to the correct tmm registers, so we need adjust them. -void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) { - SmallVector ShapedTiles; - std::map RowCfgs; - std::map ColCfgs; - - // Orderly keep the tile uses and def in ShapedTiles; - getTileShapesCfg(CfgMI, ShapedTiles); - assert(ShapedTiles.size() && "Not find shapes config!"); - - getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs); - - rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs); -} - -bool X86FastTileConfig::fastTileConfig() { - bool Changed = false; - - for (MachineBasicBlock &MBB : *MF) { - SmallVector CFGs; - for (MachineInstr &MI : MBB) - if (MI.getOpcode() == X86::PLDTILECFGV) - CFGs.push_back(&MI); - for (auto *MI : CFGs) - materializeTileCfg(MI); - if (!CFGs.empty()) - Changed = true; - } - if (Changed) + if (Change) X86FI->setHasVirtualTileReg(true); - return Changed; + + return Change; } bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) { MF = &MFunc; MRI = &MFunc.getRegInfo(); - ST = &MFunc.getSubtarget(); + const TargetSubtargetInfo *ST = &MFunc.getSubtarget(); TRI = ST->getRegisterInfo(); TII = MFunc.getSubtarget().getInstrInfo(); X86FI = MFunc.getInfo(); + bool Change = false; + + // Loop over all of the basic blocks, eliminating virtual register references + for (MachineBasicBlock &MBB : MFunc) + Change |= configBasicBlock(MBB); - return fastTileConfig(); + return Change; } FunctionPass *llvm::createX86FastTileConfigPass() { diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp index 4730b936ec1f..b01145809ac6 100644 --- a/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -229,7 +229,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget(); bool IsSlowLEA = ST.slowLEA(); bool IsSlow3OpsLEA = ST.slow3OpsLEA(); - bool LEAUsesAG = ST.LEAusesAG(); + bool LEAUsesAG = ST.leaUsesAG(); bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize(); bool UseLEAForSP = ST.useLeaForSP(); @@ -546,7 +546,6 @@ bool FixupLEAPass::optLEAALU(MachineBasicBlock::iterator &I, if (KilledIndex) KilledIndex->setIsKill(false); - MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI1, 1); MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI2, 1); MBB.erase(I); MBB.erase(AluI); diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 2f0ab4ca9de4..33f5bb365da8 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -99,17 +99,17 @@ namespace { // but the exact mapping of FP registers to stack slots is fixed later. struct LiveBundle { // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. - unsigned Mask; + unsigned Mask = 0; // Number of pre-assigned live registers in FixStack. This is 0 when the // stack order has not yet been fixed. - unsigned FixCount; + unsigned FixCount = 0; // Assigned stack order for live-in registers. // FixStack[i] == getStackEntry(i) for all i < FixCount. 
unsigned char FixStack[8]; - LiveBundle() : Mask(0), FixCount(0) {} + LiveBundle() = default; // Have the live registers been assigned a stack order yet? bool isFixed() const { return !Mask || FixCount; } @@ -866,7 +866,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { if (Opcode != -1) { I->setDesc(TII->get(Opcode)); if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr) - I->RemoveOperand(0); + I->removeOperand(0); MI.dropDebugNumber(); } else { // Insert an explicit pop // If this instruction sets FPSW, which is read in following instruction, @@ -1034,7 +1034,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { STReturns |= 1 << getFPReg(Op); // Remove the operand so that later passes don't see it. - MI.RemoveOperand(i); + MI.removeOperand(i); --i; --e; } @@ -1098,7 +1098,7 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) { LiveMask |= (1 << getFPReg(Op)); // Remove the operand so that later passes don't see it. - MI.RemoveOperand(i); + MI.removeOperand(i); --i; --e; } @@ -1162,7 +1162,7 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { unsigned DestReg = getFPReg(MI.getOperand(0)); // Change from the pseudo instruction to the concrete instruction. - MI.RemoveOperand(0); // Remove the explicit ST(0) operand + MI.removeOperand(0); // Remove the explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.addOperand( MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true)); @@ -1210,7 +1210,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { } // Convert from the pseudo instruction to the concrete instruction. - MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand + MI.removeOperand(NumOps - 1); // Remove explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.addOperand( MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true)); @@ -1263,8 +1263,8 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { } // Change from the pseudo instruction to the concrete instruction. - MI.RemoveOperand(1); // Drop the source operand. - MI.RemoveOperand(0); // Drop the destination operand. + MI.removeOperand(1); // Drop the source operand. + MI.removeOperand(0); // Drop the destination operand. MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.dropDebugNumber(); } @@ -1464,7 +1464,7 @@ void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { // Change from the pseudo instruction to the concrete instruction. MI.getOperand(0).setReg(getSTReg(Op1)); - MI.RemoveOperand(1); + MI.removeOperand(1); MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.dropDebugNumber(); @@ -1489,8 +1489,8 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { // Change the second operand to the stack register that the operand is in. // Change from the pseudo instruction to the concrete instruction. 
- MI.RemoveOperand(0); - MI.RemoveOperand(1); + MI.removeOperand(0); + MI.removeOperand(1); MI.getOperand(0).setReg(getSTReg(Op1)); MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); MI.dropDebugNumber(); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 51f2ced321bb..d524090f902e 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "X86FrameLowering.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" @@ -19,6 +20,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -99,7 +101,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MF.getInfo()->hasPreallocatedCall() || MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() || MFI.hasStackMap() || MFI.hasPatchPoint() || - MFI.hasCopyImplyingStackAdjustment()); + (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment())); } static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { @@ -435,11 +437,13 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - const MCCFIInstruction &CFIInst) const { + const MCCFIInstruction &CFIInst, + MachineInstr::MIFlag Flag) const { MachineFunction &MF = *MBB.getParent(); unsigned CFIIndex = MF.addFrameInst(CFIInst); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlag(Flag); } /// Emits Dwarf Info specifying offsets of callee saved registers and @@ -492,6 +496,87 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( } } +void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const { + const MachineFunction &MF = *MBB.getParent(); + + // Insertion point. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + + // Fake a debug loc. + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + // Zero out FP stack if referenced. Do this outside of the loop below so that + // it's done only once. + const X86Subtarget &ST = MF.getSubtarget(); + for (MCRegister Reg : RegsToZero.set_bits()) { + if (!X86::RFP80RegClass.contains(Reg)) + continue; + + unsigned NumFPRegs = ST.is64Bit() ? 8 : 7; + for (unsigned i = 0; i != NumFPRegs; ++i) + BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0)); + + for (unsigned i = 0; i != NumFPRegs; ++i) + BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0); + break; + } + + // For GPRs, we only care to clear out the 32-bit register. + BitVector GPRsToZero(TRI->getNumRegs()); + for (MCRegister Reg : RegsToZero.set_bits()) + if (TRI->isGeneralPurposeRegister(MF, Reg)) { + GPRsToZero.set(getX86SubSuperRegisterOrZero(Reg, 32)); + RegsToZero.reset(Reg); + } + + for (MCRegister Reg : GPRsToZero.set_bits()) + BuildMI(MBB, MBBI, DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Zero out registers. + for (MCRegister Reg : RegsToZero.set_bits()) { + if (ST.hasMMX() && X86::VR64RegClass.contains(Reg)) + // FIXME: Ignore MMX registers? 
+ continue; + + unsigned XorOp; + if (X86::VR128RegClass.contains(Reg)) { + // XMM# + if (!ST.hasSSE1()) + continue; + XorOp = X86::PXORrr; + } else if (X86::VR256RegClass.contains(Reg)) { + // YMM# + if (!ST.hasAVX()) + continue; + XorOp = X86::VPXORrr; + } else if (X86::VR512RegClass.contains(Reg)) { + // ZMM# + if (!ST.hasAVX512()) + continue; + XorOp = X86::VPXORYrr; + } else if (X86::VK1RegClass.contains(Reg) || + X86::VK2RegClass.contains(Reg) || + X86::VK4RegClass.contains(Reg) || + X86::VK8RegClass.contains(Reg) || + X86::VK16RegClass.contains(Reg)) { + if (!ST.hasVLX()) + continue; + XorOp = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr; + } else { + continue; + } + + BuildMI(MBB, MBBI, DL, TII.get(XorOp), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + } +} + void X86FrameLowering::emitStackProbe( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, @@ -1289,6 +1374,9 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone); } +/// Return true if we need to use the restricted Windows x64 prologue and +/// epilogue code patterns that can be described with WinCFI (.seh_* +/// directives). bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const { return MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); } @@ -1558,12 +1646,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth), + MachineInstr::FrameSetup); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( - nullptr, DwarfFramePtr, 2 * stackGrowth)); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, + 2 * stackGrowth), + MachineInstr::FrameSetup); } if (NeedsWinCFI) { @@ -1630,7 +1721,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI( MBB, MBBI, DL, - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr), + MachineInstr::FrameSetup); } if (NeedsWinFPO) { @@ -1681,7 +1773,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset), + MachineInstr::FrameSetup); StackOffset += stackGrowth; } @@ -1962,7 +2055,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, assert(StackSize); BuildCFI( MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth), + MachineInstr::FrameSetup); } // Emit DWARF info specifying the offsets of the callee-saved registers. @@ -2145,11 +2239,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned DwarfStackPtr = TRI->getDwarfRegNum(Is64Bit ? 
X86::RSP : X86::ESP, true); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize)); + MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize), + MachineInstr::FrameDestroy); if (!MBB.succ_empty() && !MBB.isReturnBlock()) { unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, AfterPop, DL, - MCCFIInstruction::createRestore(nullptr, DwarfFramePtr)); + MCCFIInstruction::createRestore(nullptr, DwarfFramePtr), + MachineInstr::FrameDestroy); --MBBI; --AfterPop; } @@ -2226,7 +2322,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset( - nullptr, CSSize + TailCallArgReserveSize + SlotSize)); + nullptr, CSSize + TailCallArgReserveSize + SlotSize), + MachineInstr::FrameDestroy); } --MBBI; } @@ -2252,7 +2349,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Opc == X86::POP32r || Opc == X86::POP64r) { Offset += SlotSize; BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset), + MachineInstr::FrameDestroy); } } } @@ -2830,17 +2928,8 @@ void X86FrameLowering::adjustForSegmentedStacks( // prologue. StackSize = MFI.getStackSize(); - // Do not generate a prologue for leaf functions with a stack of size zero. - // For non-leaf functions we have to allow for the possibility that the - // callis to a non-split function, as in PR37807. This function could also - // take the address of a non-split function. When the linker tries to adjust - // its non-existent prologue, it would fail with an error. Mark the object - // file so that such failures are not errors. See this Go language bug-report - // https://go-review.googlesource.com/c/go/+/148819/ - if (StackSize == 0 && !MFI.hasTailCall()) { - MF.getMMI().setHasNosplitStack(true); + if (!MFI.needsSplitStackProlog()) return; - } MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); @@ -3023,7 +3112,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addReg(0) .addExternalSymbol("__morestack_addr") .addReg(0); - MF.getMMI().setUsesMorestackAddr(true); } else { if (Is64Bit) BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 987facbfeae4..9b83fe77d505 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -176,7 +176,8 @@ public: /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; + const DebugLoc &DL, const MCCFIInstruction &CFIInst, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; /// Sets up EBP and optionally ESI based on the incoming EBP value. Only /// needed for 32-bit. Used in funclet prologues and at catchret destinations. @@ -233,6 +234,10 @@ private: const DebugLoc &DL, uint64_t Offset, uint64_t Align) const; + /// Emit target zero call-used regs. + void emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const override; + void adjustFrameForMsvcCxxEh(MachineFunction &MF) const; /// Aligns the stack pointer by ANDing it with -MaxAlign. 
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 5b90c67deae6..f88037e95d33 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -59,30 +59,27 @@ namespace { enum { RegBase, FrameIndexBase - } BaseType; + } BaseType = RegBase; // This is really a union, discriminated by BaseType! SDValue Base_Reg; - int Base_FrameIndex; + int Base_FrameIndex = 0; - unsigned Scale; + unsigned Scale = 1; SDValue IndexReg; - int32_t Disp; + int32_t Disp = 0; SDValue Segment; - const GlobalValue *GV; - const Constant *CP; - const BlockAddress *BlockAddr; - const char *ES; - MCSymbol *MCSym; - int JT; + const GlobalValue *GV = nullptr; + const Constant *CP = nullptr; + const BlockAddress *BlockAddr = nullptr; + const char *ES = nullptr; + MCSymbol *MCSym = nullptr; + int JT = -1; Align Alignment; // CP alignment. - unsigned char SymbolFlags; // X86II::MO_* + unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* bool NegateIndex = false; - X86ISelAddressMode() - : BaseType(RegBase), Base_FrameIndex(0), Scale(1), Disp(0), GV(nullptr), - CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1), - SymbolFlags(X86II::MO_NO_FLAG) {} + X86ISelAddressMode() = default; bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || @@ -446,6 +443,43 @@ namespace { return getI8Imm(InsertIdx ? 0x02 : 0x30, DL); } + SDValue getSBBZero(SDNode *N) { + SDLoc dl(N); + MVT VT = N->getSimpleValueType(0); + + // Create zero. + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); + SDValue Zero = + SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); + if (VT == MVT::i64) { + Zero = SDValue( + CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, + CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), + 0); + } + + // Copy flags to the EFLAGS register and glue it to next node. + unsigned Opcode = N->getOpcode(); + assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && + "Unexpected opcode for SBB materialization"); + unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + N->getOperand(FlagOpIndex), SDValue()); + + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; + MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + VTs = CurDAG->getVTList(SBBVT, MVT::i32); + return SDValue( + CurDAG->getMachineNode(Opc, dl, VTs, + {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}), + 0); + } + // Helper to detect unneeded and instructions on shift amounts. Called // from PatFrags in tablegen. bool isUnneededShiftMask(SDNode *N, unsigned Width) const { @@ -476,6 +510,9 @@ namespace { return Subtarget->getInstrInfo(); } + /// Return a condition code of the given SDNode + X86::CondCode getCondFromNode(SDNode *N) const; + /// Address-mode matching performs shift-of-and to and-of-shift /// reassociation in order to expose more scaled addressing /// opportunities. 
@@ -492,7 +529,7 @@ namespace { unsigned StoreSize = N->getMemoryVT().getStoreSize(); - if (N->getAlignment() < StoreSize) + if (N->getAlign().value() < StoreSize) return false; switch (StoreSize) { @@ -2391,6 +2428,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return false; break; + case ISD::XOR: + // We want to look through a transform in InstCombine that + // turns 'add' with min_signed_val into 'xor', so we can treat this 'xor' + // exactly like an 'add'. + if (isMinSignedConstant(N.getOperand(1)) && !matchAdd(N, AM, Depth)) + return false; + break; + case ISD::AND: { // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. @@ -2745,10 +2790,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: - /* TODO: These opcodes can be added safely, but we may want to justify - their inclusion for different reasons (better for reg-alloc). case X86ISD::SMUL: case X86ISD::UMUL: + /* TODO: These opcodes can be added safely, but we may want to justify + their inclusion for different reasons (better for reg-alloc). case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: @@ -2759,10 +2804,9 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, return false; } }; - // TODO: This could be an 'or' rather than 'and' to make the transform more - // likely to happen. We might want to factor in whether there's a - // load folding opportunity for the math op that disappears with LEA. - if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) + // TODO: We might want to factor in whether there's a load folding + // opportunity for the math op that disappears with LEA. + if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1))) Complexity++; } @@ -2891,24 +2935,15 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { CR->getSignedMax().slt(1ull << Width); } -static X86::CondCode getCondFromNode(SDNode *N) { +X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { assert(N->isMachineOpcode() && "Unexpected node"); - X86::CondCode CC = X86::COND_INVALID; unsigned Opc = N->getMachineOpcode(); - if (Opc == X86::JCC_1) - CC = static_cast(N->getConstantOperandVal(1)); - else if (Opc == X86::SETCCr) - CC = static_cast(N->getConstantOperandVal(0)); - else if (Opc == X86::SETCCm) - CC = static_cast(N->getConstantOperandVal(5)); - else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || - Opc == X86::CMOV64rr) - CC = static_cast(N->getConstantOperandVal(2)); - else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || - Opc == X86::CMOV64rm) - CC = static_cast(N->getConstantOperandVal(6)); - - return CC; + const MCInstrDesc &MCID = getInstrInfo()->get(Opc); + int CondNo = X86::getCondSrcNoFromDesc(MCID); + if (CondNo < 0) + return X86::COND_INVALID; + + return static_cast(N->getConstantOperandVal(CondNo)); } /// Test whether the given X86ISD::CMP node has any users that use a flag @@ -3464,7 +3499,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { const bool AllowExtraUsesByDefault = Subtarget->hasBMI2(); auto checkUses = [AllowExtraUsesByDefault](SDValue Op, unsigned NUses, Optional AllowExtraUses) { - return AllowExtraUses.getValueOr(AllowExtraUsesByDefault) || + return AllowExtraUses.value_or(AllowExtraUsesByDefault) || Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); }; auto checkOneUse = [checkUses](SDValue Op, @@ -5478,7 +5513,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) 
{ MVT CmpVT = N0.getSimpleValueType(); // Floating point needs special handling if we don't have FCOMI. - if (Subtarget->hasCMov()) + if (Subtarget->canUseCMOV()) break; bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; @@ -5518,7 +5553,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Move AH into flags. // Some 64-bit targets lack SAHF support, but they do support FCOMI. - assert(Subtarget->hasLAHFSAHF() && + assert(Subtarget->canUseLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); Chain = AH; @@ -5567,40 +5602,86 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. - if (N0.getOpcode() == ISD::AND && - N0.getNode()->hasOneUse() && + if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8) { - ConstantSDNode *C = dyn_cast(N0.getOperand(1)); - if (!C) break; - uint64_t Mask = C->getZExtValue(); + auto *MaskC = dyn_cast(N0.getOperand(1)); + if (!MaskC) + break; + // We may have looked through a truncate so mask off any bits that // shouldn't be part of the compare. + uint64_t Mask = MaskC->getZExtValue(); Mask &= maskTrailingOnes(CmpVT.getScalarSizeInBits()); - // Check if we can replace AND+IMM64 with a shift. This is possible for - // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero - // flag. - if (CmpVT == MVT::i64 && !isInt<32>(Mask) && + // Check if we can replace AND+IMM{32,64} with a shift. This is possible + // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the + // zero flag. + if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) && onlyUsesZeroFlag(SDValue(Node, 0))) { - if (isMask_64(~Mask)) { - unsigned TrailingZeros = countTrailingZeros(Mask); - SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); - SDValue Shift = - SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32, - N0.getOperand(0), Imm), 0); - MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, - MVT::i32, Shift, Shift); - ReplaceNode(Node, Test); - return; + unsigned ShiftOpcode = ISD::DELETED_NODE; + unsigned ShiftAmt; + unsigned SubRegIdx; + MVT SubRegVT; + unsigned TestOpcode; + unsigned LeadingZeros = countLeadingZeros(Mask); + unsigned TrailingZeros = countTrailingZeros(Mask); + + // With leading/trailing zeros, the transform is profitable if we can + // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without + // incurring any extra register moves. + bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse(); + if (LeadingZeros == 0 && SavesBytes) { + // If the mask covers the most significant bit, then we can replace + // TEST+AND with a SHR and check eflags. + // This emits a redundant TEST which is subsequently eliminated. + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = 0; + TestOpcode = X86::TEST64rr; + } else if (TrailingZeros == 0 && SavesBytes) { + // If the mask covers the least significant bit, then we can replace + // TEST+AND with a SHL and check eflags. + // This emits a redundant TEST which is subsequently eliminated. 
+ ShiftOpcode = X86::SHL64ri; + ShiftAmt = LeadingZeros; + SubRegIdx = 0; + TestOpcode = X86::TEST64rr; + } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) { + // If the shifted mask extends into the high half and is 8/16/32 bits + // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. + unsigned PopCount = 64 - LeadingZeros - TrailingZeros; + if (PopCount == 8) { + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = X86::sub_8bit; + SubRegVT = MVT::i8; + TestOpcode = X86::TEST8rr; + } else if (PopCount == 16) { + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = X86::sub_16bit; + SubRegVT = MVT::i16; + TestOpcode = X86::TEST16rr; + } else if (PopCount == 32) { + ShiftOpcode = X86::SHR64ri; + ShiftAmt = TrailingZeros; + SubRegIdx = X86::sub_32bit; + SubRegVT = MVT::i32; + TestOpcode = X86::TEST32rr; + } } - if (isMask_64(Mask)) { - unsigned LeadingZeros = countLeadingZeros(Mask); - SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); - SDValue Shift = - SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32, - N0.getOperand(0), Imm), 0); - MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, - MVT::i32, Shift, Shift); + if (ShiftOpcode != ISD::DELETED_NODE) { + SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64); + SDValue Shift = SDValue( + CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32, + N0.getOperand(0), ShiftC), + 0); + if (SubRegIdx != 0) { + Shift = + CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift); + } + MachineSDNode *Test = + CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } @@ -5769,21 +5850,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case X86ISD::SETCC_CARRY: { - // We have to do this manually because tblgen will put the eflags copy in - // the wrong place if we use an extract_subreg in the pattern. MVT VT = Node->getSimpleValueType(0); + SDValue Result; + if (Subtarget->hasSBBDepBreaking()) { + // We have to do this manually because tblgen will put the eflags copy in + // the wrong place if we use an extract_subreg in the pattern. + // Copy flags to the EFLAGS register and glue it to next node. + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + Node->getOperand(1), SDValue()); - // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = - CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, - Node->getOperand(1), SDValue()); - - // Create a 64-bit instruction if the result is 64-bits otherwise use the - // 32-bit version. - unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; - MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; - SDValue Result = SDValue( - CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; + MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + Result = SDValue( + CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), + 0); + } else { + // The target does not recognize sbb with the same reg operand as a + // no-source idiom, so we explicitly zero the input values. + Result = getSBBZero(Node); + } // For less than 32-bits we need to extract from the 32-bit node. 
if (VT == MVT::i8 || VT == MVT::i16) { @@ -5798,35 +5886,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case X86ISD::SBB: { if (isNullConstant(Node->getOperand(0)) && isNullConstant(Node->getOperand(1))) { - MVT VT = Node->getSimpleValueType(0); - - // Create zero. - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); - SDValue Zero = - SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); - if (VT == MVT::i64) { - Zero = SDValue( - CurDAG->getMachineNode( - TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, - CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, - CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), - 0); - } - - // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = - CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, - Node->getOperand(2), SDValue()); - - // Create a 64-bit instruction if the result is 64-bits otherwise use the - // 32-bit version. - unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; - MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; - VTs = CurDAG->getVTList(SBBVT, MVT::i32); - SDValue Result = - SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS, - EFLAGS.getValue(1)}), - 0); + SDValue Result = getSBBZero(Node); // Replace the flag use. ReplaceUses(SDValue(Node, 1), Result.getValue(1)); @@ -5834,6 +5894,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Replace the result use. if (!SDValue(Node, 0).use_empty()) { // For less than 32-bits we need to extract from the 32-bit node. + MVT VT = Node->getSimpleValueType(0); if (VT == MVT::i8 || VT == MVT::i16) { int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); @@ -6112,6 +6173,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: + case InlineAsm::Constraint_p: // address if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90753b5b4d33..61c1fd25031d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -108,9 +108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); - X86ScalarSSEf64 = Subtarget.hasSSE2(); - X86ScalarSSEf32 = Subtarget.hasSSE1(); - X86ScalarSSEf16 = Subtarget.hasFP16(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. @@ -170,7 +167,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. // FIXME: Should we be limiting the atomic size on other configs? Default is // 1024. - if (!Subtarget.hasCmpxchg8b()) + if (!Subtarget.canUseCMPXCHG8B()) setMaxAtomicSizeInBitsSupported(32); // Set up the register classes. @@ -200,7 +197,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Integer absolute. 
- if (Subtarget.hasCMov()) { + if (Subtarget.canUseCMOV()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) @@ -314,7 +311,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. - if (!X86ScalarSSEf64) { + if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { @@ -415,14 +412,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f128, Expand); } - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f80, MVT::f16, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); + for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand); + setTruncStoreAction(VT, MVT::f16, Expand); + setTruncStoreAction(VT, MVT::bf16, Expand); + + setOperationAction(ISD::BF16_TO_FP, VT, Expand); + setOperationAction(ISD::FP_TO_BF16, VT, Expand); + } setOperationAction(ISD::PARITY, MVT::i8, Custom); setOperationAction(ISD::PARITY, MVT::i16, Custom); @@ -497,7 +495,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, VT, Custom); } - if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) + if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); @@ -516,9 +514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.is64Bit()) setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - if (Subtarget.hasCmpxchg16b()) { + if (Subtarget.canUseCMPXCHG16B()) setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); - } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && @@ -535,7 +532,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); - if (Subtarget.getTargetTriple().isPS4CPU()) + if (Subtarget.isTargetPS()) setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand); else setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); @@ -556,9 +553,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); - if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { - // f32 and f64 use SSE. + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { + // f16, f32 and f64 use SSE. // Set up the FP register classes. + addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass + : &X86::FR16RegClass); addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? 
&X86::FR64XRegClass @@ -590,11 +591,54 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSINCOS, VT, Expand); } + // Half type will be promoted by default. + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FMA, MVT::f16, Promote); + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FSQRT, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::SELECT, MVT::f16, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); - } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 && + } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. @@ -664,6 +708,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } + // Support fp16 0 immediate. + if (isTypeLegal(MVT::f16)) + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); + // Handle constrained floating-point operations of scalar. 
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); @@ -673,7 +721,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); @@ -725,7 +772,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + if (isTypeLegal(MVT::f16)) { + setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); + } else { + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + } // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); @@ -877,7 +929,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. - if (VT.getVectorElementType() == MVT::f16) + if (VT.getVectorElementType() == MVT::f16 || + VT.getVectorElementType() == MVT::bf16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } @@ -949,6 +1002,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); + setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); @@ -1067,6 +1122,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); + // Add 32-bit vector stores to help vectorization opportunities. + setOperationAction(ISD::STORE, MVT::v2i16, Custom); + setOperationAction(ISD::STORE, MVT::v4i8, Custom); + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); @@ -1285,13 +1344,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::v4i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Custom); + setOperationAction(ISD::FSHR, VT, Custom); } - setOperationAction(ISD::FSHL, MVT::v32i8, Custom); - setOperationAction(ISD::FSHR, MVT::v32i8, Custom); - setOperationAction(ISD::FSHL, MVT::v8i32, Custom); - setOperationAction(ISD::FSHR, MVT::v8i32, Custom); - // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); @@ -1353,6 +1409,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? 
Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); + setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); + setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); @@ -1446,6 +1504,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } + if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + } + // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. @@ -1652,6 +1717,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); + setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); @@ -1698,6 +1765,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHL, MVT::v64i8, Custom); setOperationAction(ISD::FSHR, MVT::v64i8, Custom); + setOperationAction(ISD::FSHL, MVT::v32i16, Custom); + setOperationAction(ISD::FSHR, MVT::v32i16, Custom); setOperationAction(ISD::FSHL, MVT::v16i32, Custom); setOperationAction(ISD::FSHR, MVT::v16i32, Custom); @@ -1970,10 +2039,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); - if (isTypeLegal(MVT::f80)) { - setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); - } setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); @@ -2059,9 +2124,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v4f16, Custom); setOperationAction(ISD::STORE, MVT::v4f16, Custom); } - - // Support fp16 0 immediate - addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { @@ -2209,55 +2271,55 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f32, Promote); // We have target-specific dag combine patterns for the following nodes: - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); - setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::INSERT_SUBVECTOR); - setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::ADD); - 
setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FNEG); - setTargetDAGCombine(ISD::FMA); - setTargetDAGCombine(ISD::STRICT_FMA); - setTargetDAGCombine(ISD::FMINNUM); - setTargetDAGCombine(ISD::FMAXNUM); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::MLOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::MSTORE); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::STRICT_SINT_TO_FP); - setTargetDAGCombine(ISD::STRICT_UINT_TO_FP); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::MSCATTER); - setTargetDAGCombine(ISD::MGATHER); - setTargetDAGCombine(ISD::FP16_TO_FP); - setTargetDAGCombine(ISD::FP_EXTEND); - setTargetDAGCombine(ISD::STRICT_FP_EXTEND); - setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine({ISD::VECTOR_SHUFFLE, + ISD::SCALAR_TO_VECTOR, + ISD::INSERT_VECTOR_ELT, + ISD::EXTRACT_VECTOR_ELT, + ISD::CONCAT_VECTORS, + ISD::INSERT_SUBVECTOR, + ISD::EXTRACT_SUBVECTOR, + ISD::BITCAST, + ISD::VSELECT, + ISD::SELECT, + ISD::SHL, + ISD::SRA, + ISD::SRL, + ISD::OR, + ISD::AND, + ISD::ADD, + ISD::FADD, + ISD::FSUB, + ISD::FNEG, + ISD::FMA, + ISD::STRICT_FMA, + ISD::FMINNUM, + ISD::FMAXNUM, + ISD::SUB, + ISD::LOAD, + ISD::MLOAD, + ISD::STORE, + ISD::MSTORE, + ISD::TRUNCATE, + ISD::ZERO_EXTEND, + ISD::ANY_EXTEND, + ISD::SIGN_EXTEND, + ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND_VECTOR_INREG, + ISD::SIGN_EXTEND_VECTOR_INREG, + ISD::ZERO_EXTEND_VECTOR_INREG, + ISD::SINT_TO_FP, + ISD::UINT_TO_FP, + ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP, + ISD::SETCC, + ISD::MUL, + ISD::XOR, + ISD::MSCATTER, + ISD::MGATHER, + ISD::FP16_TO_FP, + ISD::FP_EXTEND, + ISD::STRICT_FP_EXTEND, + ISD::FP_ROUND}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -2568,9 +2630,9 @@ EVT X86TargetLowering::getOptimalMemOpType( bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) - return X86ScalarSSEf32; + return Subtarget.hasSSE1(); if (VT == MVT::f64) - return X86ScalarSSEf64; + return Subtarget.hasSSE2(); return true; } @@ -3566,10 +3628,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, MFI.setObjectSExt(FI, true); } + MaybeAlign Alignment; + if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && + ValVT != MVT::f80) + Alignment = MaybeAlign(4); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + Alignment); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) @@ -3906,7 +3973,7 @@ SDValue X86TargetLowering::LowerFormalArguments( else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f16) - RC = &X86::FR16XRegClass; + RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? 
&X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) @@ -4088,9 +4155,14 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, if (isByVal) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + MaybeAlign Alignment; + if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && + Arg.getSimpleValueType() != MVT::f80) + Alignment = MaybeAlign(4); return DAG.getStore( Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + Alignment); } /// Emit a load of return address if tail call @@ -5076,7 +5148,7 @@ bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, // If this is an unaligned vector, make sure the target supports folding it. auto *Ld = cast(Op.getNode()); if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && - Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16) + Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) return false; // TODO: If this is a non-temporal load and the target has an instruction @@ -5171,13 +5243,6 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { } } -static bool isTargetShuffleSplat(SDValue Op) { - unsigned Opcode = Op.getOpcode(); - if (Opcode == ISD::EXTRACT_SUBVECTOR) - return isTargetShuffleSplat(Op.getOperand(0)); - return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD; -} - SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -5429,6 +5494,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; return true; + case Intrinsic::x86_atomic_bts: + case Intrinsic::x86_atomic_btc: + case Intrinsic::x86_atomic_btr: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(0); + unsigned Size = I.getType()->getScalarSizeInBits(); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); + Info.align = Align(Size); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; + return true; + } } return false; } @@ -5643,6 +5720,22 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } +bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const { + return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); +} + +bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { + // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more + // expensive than a straight movsd. On the other hand, it's important to + // shrink long double fp constant since fldt is very slow. + return !Subtarget.hasSSE2() || VT == MVT::f80; +} + +bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && Subtarget.hasSSE2()) || + (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16; +} + bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const { @@ -5755,6 +5848,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask( (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); + // TODO: Should we always create i64 masks? Or only folded immediates? 
EVT VT = N->getValueType(0); if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { @@ -6281,7 +6375,8 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, // Helper function to collect subvector ops that are concatenated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. -static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { +static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops, + SelectionDAG &DAG) { assert(Ops.empty() && "Expected an empty ops vector"); if (N->getOpcode() == ISD::CONCAT_VECTORS) { @@ -6297,21 +6392,34 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { EVT SubVT = Sub.getValueType(); // TODO - Handle more general insert_subvector chains. - if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && - Idx == (VT.getVectorNumElements() / 2)) { - // insert_subvector(insert_subvector(undef, x, lo), y, hi) - if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && - Src.getOperand(1).getValueType() == SubVT && - isNullConstant(Src.getOperand(2))) { - Ops.push_back(Src.getOperand(1)); + if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) { + // insert_subvector(undef, x, lo) + if (Idx == 0 && Src.isUndef()) { Ops.push_back(Sub); + Ops.push_back(DAG.getUNDEF(SubVT)); return true; } - // insert_subvector(x, extract_subvector(x, lo), hi) - if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && - Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { - Ops.append(2, Sub); - return true; + if (Idx == (VT.getVectorNumElements() / 2)) { + // insert_subvector(insert_subvector(undef, x, lo), y, hi) + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + // insert_subvector(x, extract_subvector(x, lo), hi) + if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { + Ops.append(2, Sub); + return true; + } + // insert_subvector(undef, x, hi) + if (Src.isUndef()) { + Ops.push_back(DAG.getUNDEF(SubVT)); + Ops.push_back(Sub); + return true; + } } } } @@ -6770,7 +6878,7 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { } } SmallVector CatOps; - if (collectConcatOps(V.getNode(), CatOps)) { + if (collectConcatOps(V.getNode(), CatOps, DAG)) { for (SDValue &CatOp : CatOps) { SDValue NotCat = IsNOT(CatOp, DAG); if (!NotCat) return SDValue(); @@ -7934,8 +8042,35 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl &Mask, } } +// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask. +static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, + SDValue Cond, bool IsBLENDV = false) { + EVT CondVT = Cond.getValueType(); + unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); + unsigned NumElts = CondVT.getVectorNumElements(); + + APInt UndefElts; + SmallVector EltBits; + if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, + true, false)) + return false; + + Mask.resize(NumElts, SM_SentinelUndef); + + for (int i = 0; i != (int)NumElts; ++i) { + Mask[i] = i; + // Arbitrarily choose from the 2nd operand if the select condition element + // is undef. + // TODO: Can we do better by matching patterns such as even/odd? 
+ if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) || + (IsBLENDV && EltBits[i].isNonNegative())) + Mask[i] += NumElts; + } + + return true; +} + // Forward declaration (for getFauxShuffleMask recursive check). -// TODO: Use DemandedElts variant. static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG, unsigned Depth, @@ -7987,11 +8122,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; + // We can't assume an undef src element gives an undef dst - the other src + // might be zero. + if (!UndefElts.isZero()) + return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { - if (UndefElts[i]) { - Mask.push_back(SM_SentinelUndef); - continue; - } const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; @@ -8240,6 +8375,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } return true; } + case ISD::VSELECT: + case X86ISD::BLENDV: { + SDValue Cond = N.getOperand(0); + if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) { + Ops.push_back(N.getOperand(1)); + Ops.push_back(N.getOperand(2)); + return true; + } + return false; + } case X86ISD::VTRUNC: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); @@ -9076,7 +9221,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. - if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && + if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); @@ -9462,7 +9607,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + if (ScalarSize == 32 || + (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; @@ -11651,33 +11797,6 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef Mask, return true; } -// Attempt to create a shuffle mask from a VSELECT condition mask. -static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, - SDValue Cond) { - EVT CondVT = Cond.getValueType(); - unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); - unsigned NumElts = CondVT.getVectorNumElements(); - - APInt UndefElts; - SmallVector EltBits; - if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, - true, false)) - return false; - - Mask.resize(NumElts, SM_SentinelUndef); - - for (int i = 0; i != (int)NumElts; ++i) { - Mask[i] = i; - // Arbitrarily choose from the 2nd operand if the select condition element - // is undef. - // TODO: Can we do better by matching patterns such as even/odd? - if (UndefElts[i] || EltBits[i].isZero()) - Mask[i] += NumElts; - } - - return true; -} - // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. 
static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { @@ -13943,8 +14062,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { - V = peekThroughBitcasts(V); - return ISD::isNON_EXTLoad(V.getNode()); + return V->hasOneUse() && + ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } /// Try to lower insertion of a single element into a zero vector. @@ -15796,7 +15915,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, V1 = extract128BitVector(V1V2, 0, DAG, DL); V2 = extract128BitVector(V1V2, 4, DAG, DL); } else { - SmallVector DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32)); + SmallVector DWordClearOps(4, + DAG.getConstant(0, DL, MVT::i32)); for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1)) DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32); SDValue DWordClearMask = @@ -16615,9 +16735,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( // otherwise we're (probably) better off doing a split. if (VT == MVT::v4f64 && !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) - if (SDValue V = - lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG)) - return V; + return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG); // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element @@ -17229,114 +17347,135 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( return SDValue(); // Bail if we already have a repeated lane shuffle mask. - SmallVector RepeatedShuffleMask; - if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) + if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); - // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes - // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. - int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; - int NumSubLanes = NumLanes * SubLaneScale; - int NumSubLaneElts = NumLaneElts / SubLaneScale; - - // Check that all the sources are coming from the same lane and see if we can - // form a repeating shuffle mask (local to each sub-lane). At the same time, - // determine the source sub-lane for each destination sub-lane. - int TopSrcSubLane = -1; - SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); - SmallVector RepeatedSubLaneMasks[2] = { - SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef), - SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)}; - - for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { - // Extract the sub-lane mask, check that it all comes from the same lane - // and normalize the mask entries to come from the first lane. - int SrcLane = -1; - SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); - for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { - int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; - if (M < 0) + // Helper to look for repeated mask in each split sublane, and that those + // sublanes can then be permuted into place. + auto ShuffleSubLanes = [&](int SubLaneScale) { + int NumSubLanes = NumLanes * SubLaneScale; + int NumSubLaneElts = NumLaneElts / SubLaneScale; + + // Check that all the sources are coming from the same lane and see if we + // can form a repeating shuffle mask (local to each sub-lane). 
At the same + // time, determine the source sub-lane for each destination sub-lane. + int TopSrcSubLane = -1; + SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); + SmallVector> RepeatedSubLaneMasks( + SubLaneScale, + SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)); + + for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { + // Extract the sub-lane mask, check that it all comes from the same lane + // and normalize the mask entries to come from the first lane. + int SrcLane = -1; + SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); + for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { + int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; + if (M < 0) + continue; + int Lane = (M % NumElts) / NumLaneElts; + if ((0 <= SrcLane) && (SrcLane != Lane)) + return SDValue(); + SrcLane = Lane; + int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); + SubLaneMask[Elt] = LocalM; + } + + // Whole sub-lane is UNDEF. + if (SrcLane < 0) continue; - int Lane = (M % NumElts) / NumLaneElts; - if ((0 <= SrcLane) && (SrcLane != Lane)) - return SDValue(); - SrcLane = Lane; - int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); - SubLaneMask[Elt] = LocalM; - } - // Whole sub-lane is UNDEF. - if (SrcLane < 0) - continue; + // Attempt to match against the candidate repeated sub-lane masks. + for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { + auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { + for (int i = 0; i != NumSubLaneElts; ++i) { + if (M1[i] < 0 || M2[i] < 0) + continue; + if (M1[i] != M2[i]) + return false; + } + return true; + }; + + auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; + if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) + continue; - // Attempt to match against the candidate repeated sub-lane masks. - for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { - auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { + // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { - if (M1[i] < 0 || M2[i] < 0) + int M = SubLaneMask[i]; + if (M < 0) continue; - if (M1[i] != M2[i]) - return false; + assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && + "Unexpected mask element"); + RepeatedSubLaneMask[i] = M; } - return true; - }; - auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; - if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) - continue; + // Track the top most source sub-lane - by setting the remaining to + // UNDEF we can greatly simplify shuffle matching. + int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; + TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); + Dst2SrcSubLanes[DstSubLane] = SrcSubLane; + break; + } - // Merge the sub-lane mask into the matching repeated sub-lane mask. - for (int i = 0; i != NumSubLaneElts; ++i) { - int M = SubLaneMask[i]; + // Bail if we failed to find a matching repeated sub-lane mask. + if (Dst2SrcSubLanes[DstSubLane] < 0) + return SDValue(); + } + assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && + "Unexpected source lane"); + + // Create a repeating shuffle mask for the entire vector. 
+ SmallVector RepeatedMask((unsigned)NumElts, -1); + for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { + int Lane = SubLane / SubLaneScale; + auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; + for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { + int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; - assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && - "Unexpected mask element"); - RepeatedSubLaneMask[i] = M; + int Idx = (SubLane * NumSubLaneElts) + Elt; + RepeatedMask[Idx] = M + (Lane * NumLaneElts); } - - // Track the top most source sub-lane - by setting the remaining to UNDEF - // we can greatly simplify shuffle matching. - int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; - TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); - Dst2SrcSubLanes[DstSubLane] = SrcSubLane; - break; } + SDValue RepeatedShuffle = + DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); - // Bail if we failed to find a matching repeated sub-lane mask. - if (Dst2SrcSubLanes[DstSubLane] < 0) - return SDValue(); - } - assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && - "Unexpected source lane"); - - // Create a repeating shuffle mask for the entire vector. - SmallVector RepeatedMask((unsigned)NumElts, -1); - for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { - int Lane = SubLane / SubLaneScale; - auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; - for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { - int M = RepeatedSubLaneMask[Elt]; - if (M < 0) + // Shuffle each source sub-lane to its destination. + SmallVector SubLaneMask((unsigned)NumElts, -1); + for (int i = 0; i != NumElts; i += NumSubLaneElts) { + int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; + if (SrcSubLane < 0) continue; - int Idx = (SubLane * NumSubLaneElts) + Elt; - RepeatedMask[Idx] = M + (Lane * NumLaneElts); + for (int j = 0; j != NumSubLaneElts; ++j) + SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } - } - SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); - // Shuffle each source sub-lane to its destination. - SmallVector SubLaneMask((unsigned)NumElts, -1); - for (int i = 0; i != NumElts; i += NumSubLaneElts) { - int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; - if (SrcSubLane < 0) - continue; - for (int j = 0; j != NumSubLaneElts; ++j) - SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); - } + return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), + SubLaneMask); + }; - return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), - SubLaneMask); + // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes + // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes, + // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors. + // Otherwise we can only permute whole 128-bit lanes. + int MinSubLaneScale = 1, MaxSubLaneScale = 1; + if (Subtarget.hasAVX2() && VT.is256BitVector()) { + bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts); + MinSubLaneScale = 2; + MaxSubLaneScale = + (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 
4 : 2; + } + if (Subtarget.hasBWI() && VT == MVT::v64i8) + MinSubLaneScale = MaxSubLaneScale = 4; + + for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2) + if (SDValue Shuffle = ShuffleSubLanes(Scale)) + return Shuffle; + + return SDValue(); } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, @@ -17513,6 +17652,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Op; + bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); + bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); + // If we have lane crossing shuffles AND they don't all come from the lower // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently @@ -17521,13 +17663,11 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) && (V1.getOpcode() != ISD::BUILD_VECTOR) && (V2.getOpcode() != ISD::BUILD_VECTOR)) - if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, - Mask, DAG)) - return Op; + return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG); // If we have one input in place, then we can permute the other input and // blend the result. - if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + if (V1IsInPlace || V2IsInPlace) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); @@ -17541,8 +17681,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. - if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || - isShuffleMaskInputInPlace(1, Mask)))) + if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace))) if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; @@ -17635,9 +17774,12 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; + bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); + bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); + // If we have one input in place, then we can permute the other input and // blend the result. - if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + if (V1IsInPlace || V2IsInPlace) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); @@ -17647,12 +17789,16 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return V; + // Try to lower to PERMQ(BLENDD(V1,V2)). + if (SDValue V = + lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return V; + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. 
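For reference, a minimal standalone C++ check (not LLVM code) of the decomposition the ShuffleSubLanes helper above performs for a 256-bit v8i32 shuffle with SubLaneScale = 2: the original mask is rebuilt by first applying a 128-bit-lane-repeated mask and then permuting whole 64-bit sub-lanes. The concrete mask values are illustrative only, not taken from the patch.

#include <array>
#include <cassert>

int main() {
  // Original v8i32 shuffle mask (single input), and its decomposition into a
  // lane-repeated mask followed by a sub-lane (64-bit pair) permutation.
  std::array<int, 8> Mask         = {6, 7, 2, 3, 4, 5, 0, 1};
  std::array<int, 8> RepeatedMask = {2, 3, 0, 1, 6, 7, 4, 5}; // same pattern in both 128-bit lanes
  std::array<int, 8> SubLaneMask  = {4, 5, 0, 1, 6, 7, 2, 3}; // moves whole 64-bit sub-lanes

  // Composing the two shuffles must reproduce the original mask.
  for (int i = 0; i != 8; ++i)
    assert(RepeatedMask[SubLaneMask[i]] == Mask[i]);
  return 0;
}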
- if (!isShuffleMaskInputInPlace(0, Mask) && - !isShuffleMaskInputInPlace(1, Mask)) + if (!V1IsInPlace && !V2IsInPlace) if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; @@ -18657,20 +18803,34 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - // VBMI can use VPERMV/VPERMV3 byte shuffles. - if (Subtarget.hasVBMI()) - return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); - // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue Result = lowerShuffleAsLanePermuteAndPermute( + DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget)) + return Result; + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; + if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) { + // Use PALIGNR+Permute if possible - permute might become PSHUFB but the + // PALIGNR will be cheaper than the second PSHUFB+OR. + if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // If we can't directly blend but can use PSHUFB, that will be better as it + // can both shuffle and set up the inefficient blend. + bool V1InUse, V2InUse; + return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable, + DAG, V1InUse, V2InUse); + } + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (!V2.isUndef()) @@ -18678,7 +18838,10 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Result; - // FIXME: Implement direct support for this type! + // VBMI can use VPERMV/VPERMV3 byte shuffles. + if (Subtarget.hasVBMI()) + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); + return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -18915,7 +19078,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, Offset += NumElts; // Increment for next iteration. } - + // If we're broadcasting a SETCC result, try to broadcast the ops instead. + // TODO: What other unary shuffles would benefit from this? + if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC && + V1->hasOneUse()) { + SDValue Op0 = V1.getOperand(0); + SDValue Op1 = V1.getOperand(1); + ISD::CondCode CC = cast(V1.getOperand(2))->get(); + EVT OpVT = Op0.getValueType(); + return DAG.getSetCC( + DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask), + DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC); + } MVT ExtVT; switch (VT.SimpleTy) { @@ -19619,9 +19793,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); if (IsZeroElt || IsAllOnesElt) { - // Lower insertion of i8 -1 as an 'OR' blend. + // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend. // We don't deal with i8 0 since it appears to be handled elsewhere. 
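A small scalar sketch of the "OR blend" trick used by the INSERT_VECTOR_ELT hunk here for all-ones elements: OR-ing with a constant vector that is all-ones in the target lane and zero elsewhere leaves the other lanes untouched (x | 0 == x) and forces the chosen lane to ~0. Plain C++ illustration only; the lane values are made up.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint8_t, 4> V = {0x12, 0x34, 0x56, 0x78};
  const unsigned Idx = 2;                      // lane receiving the -1 element
  std::array<uint8_t, 4> OrMask = {0, 0, 0xFF, 0};

  for (unsigned i = 0; i != 4; ++i) {
    uint8_t Expected = (i == Idx) ? 0xFF : V[i];
    assert(uint8_t(V[i] | OrMask[i]) == Expected);
  }
  return 0;
}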
- if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) { + if (IsAllOnesElt && + ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) || + ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) { SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType()); SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType()); SmallVector CstVectorElts(NumElts, ZeroCst); @@ -19652,7 +19828,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || - (Subtarget.hasAVX2() && EltVT == MVT::i32)) { + (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, DAG.getTargetConstant(1, dl, MVT::i8)); @@ -19666,7 +19842,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If we are not inserting into the low 128-bit vector chunk, // then prefer the broadcast+blend sequence. // FIXME: relax the profitability check iff all N1 uses are insertions. - if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 && + if (IdxVal >= NumEltsIn128 && ((Subtarget.hasAVX2() && EltSizeInBits != 8) || (Subtarget.hasAVX() && (EltSizeInBits >= 32) && X86::mayFoldLoad(N1, Subtarget)))) { @@ -20617,6 +20793,35 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, return Cvt; } +template +static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { + return VT == MVT::f16 && !Subtarget.hasFP16(); +} + +template +bool X86TargetLowering::isSoftFP16(T VT) const { + return ::isSoftFP16(VT, Subtarget); +} + +static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode(); + MVT VT = Op.getSimpleValueType(); + MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; + SDLoc dl(Op); + + SDValue Rnd = DAG.getIntPtrConstant(0, dl); + if (IsStrict) + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, + {Chain, + DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}), + Rnd}); + return DAG.getNode(ISD::FP_ROUND, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd); +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); @@ -20627,6 +20832,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (isSoftFP16(VT)) + return promoteXINT_TO_FP(Op, DAG); + if (Subtarget.isTargetWin64() && SrcVT == MVT::i128) return LowerWin64_INT128_TO_FP(Op, DAG); @@ -21123,9 +21331,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT DstVT = Op->getSimpleValueType(0); SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); + // Bail out when we don't have native conversion instructions. if (DstVT == MVT::f128) return SDValue(); + if (isSoftFP16(DstVT)) + return promoteXINT_TO_FP(Op, DAG); + if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); @@ -21158,9 +21370,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // The transform for i64->f64 isn't correct for 0 when rounding to negative // infinity. It produces -0.0, so disable under strictfp. 
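A minimal sketch of what promoteXINT_TO_FP above amounts to for a scalar when the target has no native FP16 arithmetic: perform the integer-to-FP conversion at f32 and then round to half. Plain C++, assuming a compiler with _Float16 support on x86-64 (recent clang/gcc); this is an illustration of the lowering, not the lowering itself.

#include <cassert>

// sitofp i32 -> half with no FP16 unit: widen to float, then round to half.
_Float16 int_to_half(int X) {
  float Wide = static_cast<float>(X); // the promoted SINT_TO_FP at f32
  return static_cast<_Float16>(Wide); // the trailing FP_ROUND f32 -> f16
}

int main() {
  assert(static_cast<float>(int_to_half(2)) == 2.0f);
  assert(static_cast<float>(int_to_half(-1)) == -1.0f);
  return 0;
}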
- if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict) + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() && + !IsStrict) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); - if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80) + // The transform for i32->f64/f32 isn't correct for 0 when rounding to + // negative infinity. So disable under strictfp. Using FILD instead. + if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 && + !IsStrict) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && (DstVT == MVT::f32 || DstVT == MVT::f64)) @@ -21819,27 +22035,25 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { - In = DAG.getBitcast(MVT::v8i32, In); - // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(4, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(2, DL)); static const int ShufMask[] = {0, 2, 4, 6}; - return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); + return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo), + DAG.getBitcast(MVT::v4i32, OpHi), ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { - In = DAG.getBitcast(MVT::v32i8, In); - // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
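A standalone sketch of the identity behind the v4i64 -> v4i32 truncation path above: on a little-endian target, truncating each 64-bit element is the same as gathering the even 32-bit lanes, which is exactly the {0, 2, 4, 6} shuffle the code builds. Plain C++ for illustration; assumes little-endian layout.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  std::array<uint64_t, 4> In = {0x1111111122222222ULL, 0x3333333344444444ULL,
                                0x5555555566666666ULL, 0x7777777788888888ULL};

  // View the same bytes as eight 32-bit lanes (little endian).
  std::array<uint32_t, 8> Lanes;
  std::memcpy(Lanes.data(), In.data(), sizeof(In));

  // trunc(v4i64 -> v4i32) == shuffle of the 32-bit lanes with mask {0, 2, 4, 6}.
  for (int i = 0; i != 4; ++i)
    assert(static_cast<uint32_t>(In[i]) == Lanes[2 * i]);
  return 0;
}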
if (Subtarget.hasInt256()) { // The PSHUFB mask: @@ -21847,27 +22061,30 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { -1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1 }; + In = DAG.getBitcast(MVT::v32i8, In); In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, - DAG.getBitcast(MVT::v16i16, In), - DAG.getIntPtrConstant(0, DL)); + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0, DL)); + return DAG.getBitcast(MVT::v8i16, In); } - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In, + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In, - DAG.getIntPtrConstant(16, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(4, DL)); // The PSHUFB mask: - static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, - -1, -1, -1, -1, -1, -1, -1, -1}; + static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1}; - OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); + OpLo = DAG.getBitcast(MVT::v8i16, OpLo); + OpHi = DAG.getBitcast(MVT::v8i16, OpHi); + + OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); @@ -21941,6 +22158,16 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Res; + if (isSoftFP16(SrcVT)) { + MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, + {NVT, MVT::Other}, {Chain, Src})}); + return DAG.getNode(Op.getOpcode(), dl, VT, + DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); + } + if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; @@ -22278,6 +22505,9 @@ SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op, SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT == MVT::f16) + return SDValue(); + // If the source is in an SSE register, the node is Legal. if (isScalarFPTypeInSSEReg(SrcVT)) return Op; @@ -22349,7 +22579,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // This code is only for floats and doubles. Fall back to generic code for // anything else. - if (!isScalarFPTypeInSSEReg(SrcVT)) + if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT)) return SDValue(); EVT SatVT = cast(Node->getOperand(1))->getVT(); @@ -22381,11 +22611,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // floating-point values. 
APInt MinInt, MaxInt; if (IsSigned) { - MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth); - MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth); + MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth); + MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth); } else { - MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth); - MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth); + MinInt = APInt::getMinValue(SatWidth).zext(DstWidth); + MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth); } APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT)); @@ -22484,28 +22714,54 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); - if (VT == MVT::f128) + if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80)) return SDValue(); - if (VT == MVT::f80) { - if (SVT == MVT::f16) { - assert(Subtarget.hasFP16() && "Unexpected features!"); - RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); - MakeLibCallOptions CallOptions; - std::pair Tmp = - makeLibCall(DAG, LC, VT, In, CallOptions, DL, - IsStrict ? Op.getOperand(0) : SDValue()); + if (SVT == MVT::f16) { + if (Subtarget.hasFP16()) + return Op; + + if (VT != MVT::f32) { if (IsStrict) - return DAG.getMergeValues({Tmp.first, Tmp.second}, DL); - else - return Tmp.first; + return DAG.getNode( + ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, + {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL, + {MVT::f32, MVT::Other}, {Chain, In})}); + + return DAG.getNode(ISD::FP_EXTEND, DL, VT, + DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In)); } - return Op; + + if (!Subtarget.hasF16C()) + return SDValue(); + + In = DAG.getBitcast(MVT::i16, In); + In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, + getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In, + DAG.getIntPtrConstant(0, DL)); + SDValue Res; + if (IsStrict) { + Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other}, + {Chain, In}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In, + DAG.getTargetConstant(4, DL, MVT::i32)); + } + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res, + DAG.getIntPtrConstant(0, DL)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; } + if (!SVT.isVector()) + return Op; + if (SVT.getVectorElementType() == MVT::f16) { assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!"); if (SVT == MVT::v2f16) @@ -22531,15 +22787,65 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); + + SDLoc DL(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); + SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1); MVT VT = Op.getSimpleValueType(); MVT SVT = In.getSimpleValueType(); - // It's legal except when f128 is involved or we're converting f80->f16. 
- if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80)) - return Op; + if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) + return SDValue(); - return SDValue(); + if (VT == MVT::f16) { + if (Subtarget.hasFP16()) + return Op; + + if (SVT != MVT::f32) { + if (IsStrict) + return DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, + DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other}, + {Chain, In, Op2}), + Op2}); + + return DAG.getNode(ISD::FP_ROUND, DL, VT, + DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2), + Op2); + } + + if (!Subtarget.hasF16C()) + return SDValue(); + + SDValue Res; + SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL, + MVT::i32); + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32, + DAG.getConstantFP(0, DL, MVT::v4f32), In, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other}, + {Chain, Res, Rnd}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In); + Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getBitcast(MVT::f16, Res); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + + return Res; + } + + return Op; } static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -22857,6 +23163,47 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { return Res; } +/// Helper for attempting to create a X86ISD::BT node. +static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) { + // If Src is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (Src.getValueType().getScalarSizeInBits() < 32) + Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src); + + // No legal type found, give up. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType())) + return SDValue(); + + // See if we can use the 32-bit instruction instead of the 64-bit one for a + // shorter encoding. Since the former takes the modulo 32 of BitNo and the + // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is + // known to be zero. + if (Src.getValueType() == MVT::i64 && + DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) + Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (Src.getValueType() != BitNo.getValueType()) { + // Peek through a mask/modulo operation. + // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but + // we probably need a better IsDesirableToPromoteOp to handle this as well. 
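For the scalar f16 paths above (CVTPH2PS for FP_EXTEND, CVTPS2PH for FP_ROUND), a standalone sketch using the F16C scalar intrinsics; it assumes an F16C-capable CPU and compilation with -mf16c, and it only models the round trip, not the exact vector insert/extract sequence the lowering emits.

#include <cassert>
#include <cstdint>
#include <immintrin.h>

int main() {
  // f32 -> f16 (FP_ROUND): VCVTPS2PH with the current rounding mode.
  uint16_t Half = _cvtss_sh(1.5f, _MM_FROUND_CUR_DIRECTION);
  // f16 -> f32 (FP_EXTEND): VCVTPH2PS.
  float Back = _cvtsh_ss(Half);
  assert(Back == 1.5f); // 1.5 is exactly representable in half precision
  return 0;
}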
+ if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse()) + BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(), + DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), + BitNo.getOperand(0)), + DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), + BitNo.getOperand(1))); + else + BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo); + } + + return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo); +} + /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { @@ -23303,7 +23650,7 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { return true; // We never want to use both SQRT and RSQRT instructions for the same input. - if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) + if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) @@ -23439,7 +23786,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, // Only perform this transform if CMOV is supported otherwise the select // below will become a branch. - if (!Subtarget.hasCMov()) + if (!Subtarget.canUseCMOV()) return SDValue(); // fold (sdiv X, pow2) @@ -23485,9 +23832,8 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. -static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG, - SDValue &X86CC) { +static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, + SelectionDAG &DAG, X86::CondCode &X86CC) { assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); @@ -23538,30 +23884,24 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (!Src.getNode()) return SDValue(); - // If Src is i8, promote it to i32 with any_extend. There is no i8 BT - // instruction. Since the shift amount is in-range-or-undefined, we know - // that doing a bittest on the i32 value is ok. We extend to i32 because - // the encoding for the i16 version is larger than the i32 version. - // Also promote i16 to i32 for performance / code size reason. - if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) - Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); + // Remove any bit flip. + if (isBitwiseNot(Src)) { + Src = Src.getOperand(0); + CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ; + } - // See if we can use the 32-bit instruction instead of the 64-bit one for a - // shorter encoding. Since the former takes the modulo 32 of BitNo and the - // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is - // known to be zero. - if (Src.getValueType() == MVT::i64 && - DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) - Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); + // Attempt to create the X86ISD::BT node. + if (SDValue BT = getBT(Src, BitNo, dl, DAG)) { + X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + return BT; + } - // If the operand types disagree, extend the shift amount to match. Since - // BT ignores high bits (like shifts) we can use anyextend. - if (Src.getValueType() != BitNo.getValueType()) - BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); + return SDValue(); +} - X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B, - dl, MVT::i8); - return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); +// Check if pre-AVX condcode can be performed by a single FCMP op. +static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) { + return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ); } /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask @@ -23831,7 +24171,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), // emit two comparisons and a logic op to tie them together. - if (SSECC >= 8) { + if (!cheapX86FSETCC_SSE(Cond)) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; @@ -23996,10 +24336,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl); - if (VT == MVT::v32i16 || VT == MVT::v64i8) { - assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!"); + // Break 512-bit integer vector compare into smaller ones. + // TODO: Try harder to use VPCMPx + VPMOV2x? + if (VT.is512BitVector()) return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl); - } // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid // not-of-PCMPEQ: @@ -24117,12 +24457,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. - SDValue SB; - if (FlipSigns) { - SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64); - } else { - SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64); - } + SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL + : 0x0000000080000000ULL, + dl, MVT::v2i64); + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB); @@ -24261,8 +24599,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC)) + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) { + X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8); return BT; + } } // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0. @@ -24527,6 +24868,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op1.getSimpleValueType(); SDValue CC; + if (isSoftFP16(VT)) + return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, + DAG.getBitcast(MVT::i16, Op1), + DAG.getBitcast(MVT::i16, Op2))); + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 
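A scalar sketch of the patterns that the getBT/LowerAndToBT changes above fold into a BT: '(x >> n) & 1' and 'x & (1 << n)' test the same bit, and a bitwise NOT on the source merely flips the result, so the lowering strips the NOT and swaps SETEQ/SETNE. Plain C++; the hardware's modulo-width behaviour is modelled by masking the bit index.

#include <cassert>
#include <cstdint>

// BT r32, r32 tests bit (n % 32) of the source and puts it in CF.
bool bt32(uint32_t Src, uint32_t BitNo) { return (Src >> (BitNo % 32)) & 1; }

int main() {
  uint32_t X = 0b1010;
  for (uint32_t N = 0; N != 4; ++N) {
    assert(((X >> N) & 1) == bt32(X, N));          // shift-and-mask form
    assert(((X & (1u << N)) != 0) == bt32(X, N));  // mask-against-power-of-two form
  }
  // NOT on the source only inverts the tested bit.
  assert(bt32(~X, 3) == !bt32(X, 3));
  return 0;
}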
@@ -24591,7 +24937,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } - if (Cond.getOpcode() == ISD::SETCC) { + if (Cond.getOpcode() == ISD::SETCC && + !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the @@ -24608,6 +24955,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y + // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x + // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { @@ -24624,7 +24973,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); }; - if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) && + if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) && ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) || (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) { // Keep Cmp. @@ -24652,7 +25001,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Sub.getValue(1)); return DAG.getNode(ISD::OR, DL, VT, SBB, Y); - } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && + } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue Src1, Src2; @@ -24688,6 +25037,22 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } + } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) && + Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) && + ((CondCode == X86::COND_S) || // smin(x, 0) + (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0) + // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x + // + // If the comparison is testing for a positive value, we have to invert + // the sign bit mask, so only do that transform if the target has a + // bitwise 'and not' instruction (the invert is free). + // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x + unsigned ShCt = VT.getSizeInBits() - 1; + SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT); + SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt); + if (CondCode == X86::COND_G) + Shift = DAG.getNOT(DL, Shift, VT); + return DAG.getNode(ISD::AND, DL, VT, Shift, Op1); } } @@ -24707,7 +25072,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && - !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack? + !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack? 
IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || @@ -24734,9 +25099,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue BTCC; - if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) { - CC = BTCC; + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { + CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); Cond = BT; AddTest = false; } @@ -24788,7 +25153,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // legal, but EmitLoweredSelect() can not deal with these extensions // being inserted between two CMOV's. (in i16 case too TBN) // https://bugs.llvm.org/show_bug.cgi?id=40974 - if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || + if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) || (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) && !X86::mayFoldLoad(Op2, Subtarget))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); @@ -25153,16 +25518,20 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) && !Subtarget.hasBWI())) { SmallVector CatOps; - if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) + if (StoredVal.hasOneUse() && + collectConcatOps(StoredVal.getNode(), CatOps, DAG)) return splitVectorStore(St, DAG); return SDValue(); } + if (StoreVT.is32BitVector()) + return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && - "Unexpected VT"); + assert(StoreVT.is64BitVector() && "Unexpected VT"); assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == - TargetLowering::TypeWidenVector && "Unexpected type action!"); + TargetLowering::TypeWidenVector && + "Unexpected type action!"); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, @@ -25247,8 +25616,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); SDLoc dl(Op); + // Bail out when we don't have native compare instructions. if (Cond.getOpcode() == ISD::SETCC && - Cond.getOperand(0).getValueType() != MVT::f128) { + Cond.getOperand(0).getValueType() != MVT::f128 && + !isSoftFP16(Cond.getOperand(0).getValueType())) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -25647,116 +26018,116 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { - SmallVector Elts; - unsigned NumElts = SrcOp->getNumOperands(); - + unsigned ShiftOpc; switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: - for (unsigned i = 0; i != NumElts; ++i) { - SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->isUndef()) { - // Must produce 0s in the correct bits. 
- Elts.push_back(DAG.getConstant(0, dl, ElementType)); - continue; - } - auto *ND = cast(CurrentOp); - const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); - } + ShiftOpc = ISD::SHL; break; case X86ISD::VSRLI: - for (unsigned i = 0; i != NumElts; ++i) { - SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->isUndef()) { - // Must produce 0s in the correct bits. - Elts.push_back(DAG.getConstant(0, dl, ElementType)); - continue; - } - auto *ND = cast(CurrentOp); - const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); - } + ShiftOpc = ISD::SRL; break; case X86ISD::VSRAI: - for (unsigned i = 0; i != NumElts; ++i) { - SDValue CurrentOp = SrcOp->getOperand(i); - if (CurrentOp->isUndef()) { - // All shifted in bits must be the same so use 0. - Elts.push_back(DAG.getConstant(0, dl, ElementType)); - continue; - } - auto *ND = cast(CurrentOp); - const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); - } + ShiftOpc = ISD::SRA; break; } - return DAG.getBuildVector(VT, dl, Elts); + SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT); + if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt})) + return C; } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getTargetConstant(ShiftAmt, dl, MVT::i8)); } -/// Handle vector element shifts where the shift amount may or may not be a -/// constant. Takes immediate version of shift as input. -/// TODO: Replace with vector + (splat) idx to avoid extract_element nodes. +/// Handle vector element shifts by a splat shift amount static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, - SDValue SrcOp, SDValue ShAmt, + SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - MVT SVT = ShAmt.getSimpleValueType(); - assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); - - // Change opcode to non-immediate version. - Opc = getTargetVShiftUniformOpcode(Opc, true); - - // Need to build a vector containing shift amount. - // SSE/AVX packed shifts only use the lower 64-bit of the shift count. - // +====================+============+=======================================+ - // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | - // +====================+============+=======================================+ - // | i64 | Yes, No | Use ShAmt as lowest elt | - // | i32 | Yes | zero-extend in-reg | - // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg | - // | (i32 zext(i16/i8)) | No | byte-shift-in-reg | - // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | - // +====================+============+=======================================+ - - if (SVT == MVT::i64) - ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); - else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND && - ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 || - ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) { + MVT AmtVT = ShAmt.getSimpleValueType(); + assert(AmtVT.isVector() && "Vector shift type mismatch"); + assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() && + "Illegal vector splat index"); + + // Move the splat element to the bottom element. 
+ if (ShAmtIdx != 0) { + SmallVector Mask(AmtVT.getVectorNumElements(), -1); + Mask[0] = ShAmtIdx; + ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask); + } + + // Peek through any zext node if we can get back to a 128-bit source. + if (AmtVT.getScalarSizeInBits() == 64 && + (ShAmt.getOpcode() == ISD::ZERO_EXTEND || + ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && + ShAmt.getOperand(0).getValueType().isSimple() && + ShAmt.getOperand(0).getValueType().is128BitVector()) { ShAmt = ShAmt.getOperand(0); - MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16; - ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt); - if (Subtarget.hasSSE41()) + AmtVT = ShAmt.getSimpleValueType(); + } + + // See if we can mask off the upper elements using the existing source node. + // The shift uses the entire lower 64-bits of the amount vector, so no need to + // do this for vXi64 types. + bool IsMasked = false; + if (AmtVT.getScalarSizeInBits() < 64) { + if (ShAmt.getOpcode() == ISD::BUILD_VECTOR || + ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // If the shift amount has come from a scalar, then zero-extend the scalar + // before moving to the vector. + ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt); + AmtVT = MVT::v4i32; + IsMasked = true; + } else if (ShAmt.getOpcode() == ISD::AND) { + // See if the shift amount is already masked (e.g. for rotation modulo), + // then we can zero-extend it by setting all the other mask elements to + // zero. + SmallVector MaskElts( + AmtVT.getVectorNumElements(), + DAG.getConstant(0, dl, AmtVT.getScalarType())); + MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType()); + SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts); + if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT, + {ShAmt.getOperand(1), Mask}))) { + ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask); + IsMasked = true; + } + } + } + + // Extract if the shift amount vector is larger than 128-bits. + if (AmtVT.getSizeInBits() > 128) { + ShAmt = extract128BitVector(ShAmt, 0, DAG, dl); + AmtVT = ShAmt.getSimpleValueType(); + } + + // Zero-extend bottom element to v2i64 vector type, either by extension or + // shuffle masking. 
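The reason the splatted amount must be zero-extended or masked in getTargetVShiftNode above: the SSE per-vector shifts (PSLLD and friends) consume the entire low 64 bits of the count operand as one unsigned value, not just the bottom element. A standalone SSE2 sketch, illustration only:

#include <cassert>
#include <emmintrin.h> // SSE2

int main() {
  __m128i V = _mm_set1_epi32(1);
  // _mm_set_epi32 takes elements from highest to lowest lane.
  __m128i CleanCount = _mm_set_epi32(0, 0, 0, 4); // low quadword == 4
  __m128i DirtyCount = _mm_set_epi32(0, 0, 1, 4); // low quadword == 2^32 + 4

  // PSLLD (_mm_sll_epi32) uses the whole low quadword of the count.
  assert(_mm_cvtsi128_si32(_mm_sll_epi32(V, CleanCount)) == 16); // 1 << 4
  assert(_mm_cvtsi128_si32(_mm_sll_epi32(V, DirtyCount)) == 0);  // count > 31
  return 0;
}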
+ if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) { + if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST || + ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) { + ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt); + } else if (Subtarget.hasSSE41()) { ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); - else { + } else { SDValue ByteShift = DAG.getTargetConstant( - (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); + (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); } - } else if (Subtarget.hasSSE41() && - ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), - MVT::v2i64, ShAmt); - } else { - SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), - DAG.getUNDEF(SVT)}; - ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } + // Change opcode to non-immediate version. + Opc = getTargetVShiftUniformOpcode(Opc, true); + // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); @@ -25907,8 +26278,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. - const X86Subtarget &Subtarget = - static_cast(DAG.getSubtarget()); + const X86Subtarget &Subtarget = DAG.getSubtarget(); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); @@ -26444,6 +26814,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case VSHIFT: { SDValue SrcOp = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); + assert(ShAmt.getValueType() == MVT::i32 && + "Unexpected VSHIFT amount type"); // Catch shift-by-constant. if (auto *CShAmt = dyn_cast(ShAmt)) @@ -26451,8 +26823,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getSimpleValueType(), SrcOp, CShAmt->getZExtValue(), DAG); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - SrcOp, ShAmt, Subtarget, DAG); + SrcOp, ShAmt, 0, Subtarget, DAG); } case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); @@ -27411,6 +27784,30 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_atomic_bts: + case Intrinsic::x86_atomic_btc: + case Intrinsic::x86_atomic_btr: { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue Chain = Op.getOperand(0); + SDValue Op1 = Op.getOperand(2); + SDValue Op2 = Op.getOperand(3); + unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS + : IntNo == Intrinsic::x86_atomic_btc ? 
X86ISD::LBTC + : X86ISD::LBTR; + SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32); + MachineMemOperand *MMO = cast(Op)->getMemOperand(); + SDValue Res = + DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), + {Chain, Op1, Op2, Size}, VT, MMO); + Chain = Res.getValue(1); + Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); + unsigned Imm = cast(Op2)->getZExtValue(); + if (Imm) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getShiftAmountConstant(Imm, VT, DL)); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain); + } } return SDValue(); } @@ -28394,11 +28791,27 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } -static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + // For AVX1 cases, split to use legal ops. + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return splitVectorIntBinary(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + + // Default to expand. + return SDValue(); +} + +static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - // For AVX1 cases, split to use legal ops (everything but v4i64). - if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) + // For AVX1 cases, split to use legal ops. + if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG); if (VT == MVT::v32i16 || VT == MVT::v64i8) @@ -29188,19 +29601,12 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); - unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); - if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) { - if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { - MVT EltVT = VT.getVectorElementType(); - assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); - if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); - else if (EltVT.bitsLT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - - return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); - } + int BaseShAmtIdx = -1; + if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) { + if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx, + Subtarget, DAG); // vXi8 shifts - shift as v8i16 + mask result. if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) || @@ -29212,13 +29618,12 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) { unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL); unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false); - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); // Create the mask using vXi16 shifts. For shift-rights we need to move // the upper byte down before splatting the vXi8 mask. 
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT); BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask, - BaseShAmt, Subtarget, DAG); + BaseShAmt, BaseShAmtIdx, Subtarget, DAG); if (Opcode != ISD::SHL) BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask, 8, DAG); @@ -29228,7 +29633,7 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, DAG.getBitcast(ExtVT, R), BaseShAmt, - Subtarget, DAG); + BaseShAmtIdx, Subtarget, DAG); Res = DAG.getBitcast(VT, Res); Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask); @@ -29236,8 +29641,9 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask) // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW. SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT); - SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, - BaseShAmt, Subtarget, DAG); + SignMask = + getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt, + BaseShAmtIdx, Subtarget, DAG); SignMask = DAG.getBitcast(VT, SignMask); Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask); @@ -29247,23 +29653,6 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, } } - // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. - if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { - Amt = Amt.getOperand(0); - unsigned Ratio = 64 / Amt.getScalarValueSizeInBits(); - std::vector Vals(Ratio); - for (unsigned i = 0; i != Ratio; ++i) - Vals[i] = Amt.getOperand(i); - for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) { - for (unsigned j = 0; j != Ratio; ++j) - if (Vals[j] != Amt.getOperand(i + j)) - return SDValue(); - } - - if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) - return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); - } return SDValue(); } @@ -29843,8 +30232,8 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || - VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v8i32 || - VT == MVT::v16i32) && + VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 || + VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && "Unexpected funnel shift type!"); // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw. @@ -29867,7 +30256,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, // Split 256-bit integers on XOP/pre-AVX2 targets. // Split 512-bit integers on non 512-bit BWI targets. - if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 32) || + if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) || !Subtarget.hasAVX2())) || (VT.is512BitVector() && !Subtarget.useBWIRegs() && EltSizeInBits < 32)) { @@ -29878,18 +30267,18 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z)) if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) { - if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) { + int ScalarAmtIdx = -1; + if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) { // Uniform vXi16 funnel shifts can be efficiently handled by default. 
if (EltSizeInBits == 16) return SDValue(); SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); - ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32); - Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, Subtarget, - DAG); - Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, Subtarget, - DAG); + Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, + ScalarAmtIdx, Subtarget, DAG); + Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, + ScalarAmtIdx, Subtarget, DAG); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); } } @@ -30079,18 +30468,20 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Attempt to fold as unpack(x,x) << zext(splat(y)): // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). - // TODO: Handle vXi16 cases on all targets. - if (EltSizeInBits == 8 || EltSizeInBits == 32 || - (IsROTL && EltSizeInBits == 16 && !Subtarget.hasAVX())) { - if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) { + if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) { + int BaseRotAmtIdx = -1; + if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) { + if (EltSizeInBits == 16 && Subtarget.hasSSE41()) { + unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR; + return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); + } unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI; SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); - BaseRotAmt = DAG.getZExtOrTrunc(BaseRotAmt, DL, MVT::i32); Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt, - Subtarget, DAG); + BaseRotAmtIdx, Subtarget, DAG); Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt, - Subtarget, DAG); + BaseRotAmtIdx, Subtarget, DAG); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); } } @@ -30273,14 +30664,15 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit(); + return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit(); if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); + return Subtarget.canUseCMPXCHG16B(); return false; } -bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { +TargetLoweringBase::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { Type *MemType = SI->getValueOperand()->getType(); bool NoImplicitFloatOps = @@ -30288,9 +30680,10 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && !Subtarget.useSoftFloat() && !NoImplicitFloatOps && (Subtarget.hasSSE1() || Subtarget.hasX87())) - return false; + return AtomicExpansionKind::None; - return needsCmpXchgNb(MemType); + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } // Note: this turns large loads into lock cmpxchg8b/16b. 
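A scalar model of the unpack-and-shift rotate used in LowerRotate above: doubling the element (the "unpack(x,x)") and shifting the wider value implements a rotate of the original width. Plain C++, shown for 16-bit elements; illustration only.

#include <cassert>
#include <cstdint>

// rotl(x, n) -> (unpack(x, x) << (n & (bw-1))) >> bw
uint16_t rotl16(uint16_t X, unsigned N) {
  uint32_t Doubled = (uint32_t(X) << 16) | X; // unpack(x, x)
  return uint16_t((Doubled << (N & 15)) >> 16);
}
// rotr(x, n) -> unpack(x, x) >> (n & (bw-1))
uint16_t rotr16(uint16_t X, unsigned N) {
  uint32_t Doubled = (uint32_t(X) << 16) | X;
  return uint16_t(Doubled >> (N & 15));
}

int main() {
  assert(rotl16(0xABCD, 4) == 0xBCDA);
  assert(rotr16(0xABCD, 4) == 0xDABC);
  assert(rotl16(0xABCD, 0) == 0xABCD);
  return 0;
}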
@@ -30313,6 +30706,65 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { : AtomicExpansionKind::None; } +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const { + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + if (AI->use_empty()) + return AtomicExpansionKind::None; + + // If the atomicrmw's result is used by a single bit AND, we may use + // bts/btr/btc instruction for these operations. + auto *C1 = dyn_cast(AI->getValOperand()); + Instruction *I = AI->user_back(); + if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And || + AI->getParent() != I->getParent()) + return AtomicExpansionKind::CmpXChg; + // The following instruction must be a AND single bit. + auto *C2 = dyn_cast(I->getOperand(1)); + unsigned Bits = AI->getType()->getPrimitiveSizeInBits(); + if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue())) + return AtomicExpansionKind::CmpXChg; + + if (AI->getOperation() == AtomicRMWInst::And) + return ~C1->getValue() == C2->getValue() + ? AtomicExpansionKind::BitTestIntrinsic + : AtomicExpansionKind::CmpXChg; + + return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic + : AtomicExpansionKind::CmpXChg; +} + +void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { + IRBuilder<> Builder(AI); + Intrinsic::ID IID = Intrinsic::not_intrinsic; + switch (AI->getOperation()) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Or: + IID = Intrinsic::x86_atomic_bts; + break; + case AtomicRMWInst::Xor: + IID = Intrinsic::x86_atomic_btc; + break; + case AtomicRMWInst::And: + IID = Intrinsic::x86_atomic_btr; + break; + } + Instruction *I = AI->user_back(); + LLVMContext &Ctx = AI->getContext(); + unsigned Imm = + countTrailingZeros(cast(I->getOperand(1))->getZExtValue()); + Function *BitTest = + Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); + Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), + Type::getInt8PtrTy(Ctx)); + Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); + I->replaceAllUsesWith(Result); + I->eraseFromParent(); + AI->eraseFromParent(); +} + TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; @@ -30337,10 +30789,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: - // If the atomicrmw's result isn't actually used, we can just add a "lock" - // prefix to a normal instruction for these operations. - return !AI->use_empty() ? AtomicExpansionKind::CmpXChg - : AtomicExpansionKind::None; + return shouldExpandLogicAtomicRMWInIR(AI); case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -31552,16 +32001,12 @@ SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op, // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; - Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); - SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); - - return NOOP; + return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); } // Custom split CVTPS2PH with wide types. 
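A plain C++ sketch of the source pattern that shouldExpandLogicAtomicRMWInIR/emitBitTestAtomicRMWIntrinsic above target: an atomic or/xor/and whose result is only used to test the affected bit can be emitted as lock bts/btc/btr plus a carry-flag read instead of a cmpxchg loop. The helper name here is invented for the example.

#include <atomic>
#include <cassert>
#include <cstdint>

// Atomically set bit `Bit` and return its previous value. When the fetch_or
// result is only used like this, the backend can select `lock bts` directly.
bool testAndSetBit(std::atomic<uint32_t> &Word, unsigned Bit) {
  uint32_t Mask = 1u << Bit;
  return (Word.fetch_or(Mask, std::memory_order_seq_cst) & Mask) != 0;
}

int main() {
  std::atomic<uint32_t> Word{0};
  assert(!testAndSetBit(Word, 5)); // bit was clear
  assert(testAndSetBit(Word, 5));  // bit is now set
  assert(Word.load() == (1u << 5));
  return 0;
}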
@@ -31710,8 +32155,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG); case ISD::ABS: return LowerABS(Op, Subtarget, DAG); + case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); @@ -31807,9 +32253,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res); return; } - case X86ISD::VPMADDWD: - case X86ISD::AVG: { - // Legalize types for X86ISD::AVG/VPMADDWD by widening. + case X86ISD::VPMADDWD: { + // Legalize types for X86ISD::VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -32462,7 +32907,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; - assert((!Regs64bit || Subtarget.hasCmpxchg16b()) && + assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) && "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; @@ -32821,6 +33266,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LOR) NODE_NAME_CASE(LXOR) NODE_NAME_CASE(LAND) + NODE_NAME_CASE(LBTS) + NODE_NAME_CASE(LBTC) + NODE_NAME_CASE(LBTR) NODE_NAME_CASE(VZEXT_MOVL) NODE_NAME_CASE(VZEXT_LOAD) NODE_NAME_CASE(VEXTRACT_STORE) @@ -33041,7 +33489,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SCALEF_RND) NODE_NAME_CASE(SCALEFS) NODE_NAME_CASE(SCALEFS_RND) - NODE_NAME_CASE(AVG) NODE_NAME_CASE(MULHRS) NODE_NAME_CASE(SINT_TO_FP_RND) NODE_NAME_CASE(UINT_TO_FP_RND) @@ -33222,7 +33669,6 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const { bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. - case X86ISD::AVG: case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: @@ -33418,6 +33864,20 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { return !(VT1 == MVT::i32 && VT2 == MVT::i16); } +bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode, + EVT VT) const { + // TODO: This is too general. There are cases where pre-AVX512 codegen would + // benefit. The transform may also be profitable for scalar code. + if (!Subtarget.hasAVX512()) + return false; + if (!Subtarget.hasVLX() && !VT.is512BitVector()) + return false; + if (!VT.isVector()) + return false; + + return true; +} + /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -33460,6 +33920,16 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { return TargetLowering::areJTsAllowed(Fn); } +MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const { + // Avoid 8 and 16 bit types because they increase the chance for unnecessary + // zero-extensions. 
+ if (ConditionVT.getSizeInBits() < 32) + return MVT::i32; + return TargetLoweringBase::getPreferredSwitchConditionType(Context, + ConditionVT); +} + //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// @@ -33871,6 +34341,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, // conditional jump around it. static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { + case X86::CMOV_FR16: case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: @@ -34090,7 +34561,7 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] - Register DestReg = FirstCMOV.getOperand(0).getReg(); + Register DestReg = SecondCascadedCMOV.getOperand(0).getReg(); Register Op1Reg = FirstCMOV.getOperand(1).getReg(); Register Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = @@ -34103,11 +34574,6 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // The second SecondInsertedMBB provides the same incoming value as the // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); - // Copy the PHI result to the register defined by the second CMOV. - BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, - TII->get(TargetOpcode::COPY), - SecondCascadedCMOV.getOperand(0).getReg()) - .addReg(FirstCMOV.getOperand(0).getReg()); // Now remove the CMOVs. FirstCMOV.eraseFromParent(); @@ -35546,6 +36012,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_FR16: + case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: @@ -36116,6 +36584,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } + case X86ISD::AND: { + if (Op.getResNo() == 0) { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known &= Known2; + } + break; + } case X86ISD::ANDNP: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); @@ -36257,6 +36734,28 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.setAllZero(); break; } + case X86ISD::VBROADCAST_LOAD: { + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits, + /*AllowWholeUndefs*/ false, + /*AllowPartialUndefs*/ false)) { + Known.Zero.setAllBits(); + Known.One.setAllBits(); + for (unsigned I = 0; I != NumElts; ++I) { + if (!DemandedElts[I]) + continue; + if (UndefElts[I]) { + Known.resetAll(); + break; + } + KnownBits Known2 = KnownBits::makeConstant(EltBits[I]); + Known = KnownBits::commonBits(Known, Known2); + } + return; + } + break; + } } // Handle target shuffles. @@ -37113,9 +37612,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, unsigned NumRootElts = RootVT.getVectorNumElements(); // Canonicalize shuffle input op to the requested type. - // TODO: Support cases where Op is smaller than VT. 
auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { - if (VT.getSizeInBits() < Op.getValueSizeInBits()) + if (VT.getSizeInBits() > Op.getValueSizeInBits()) + Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits()); + else if (VT.getSizeInBits() < Op.getValueSizeInBits()) Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits()); return DAG.getBitcast(VT, Op); }; @@ -37129,8 +37629,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); - assert(VT1.getSizeInBits() == RootSizeInBits && - VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch"); + assert((RootSizeInBits % VT1.getSizeInBits()) == 0 && + (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch"); SDValue Res; @@ -37157,12 +37657,13 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } - // If we are shuffling a broadcast (and not introducing zeros) then - // we can just use the broadcast directly. This works for smaller broadcast - // elements as well as they already repeat across each mask element - if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) && + // If we are shuffling a splat (and not introducing zeros) then we can just + // use it directly. This works for smaller elements as well as they already + // repeat across each mask element. + if (UnaryShuffle && !isAnyZero(BaseMask) && + V1.getValueSizeInBits() >= RootSizeInBits && (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && - V1.getValueSizeInBits() >= RootSizeInBits) { + DAG.isSplatValue(V1, /*AllowUndefs*/ false)) { return CanonicalizeShuffleInput(RootVT, V1); } @@ -37543,7 +38044,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, (RootVT.is128BitVector() && Subtarget.hasVLX())) && (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { - if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE) + // Bail if this was already a truncation or PACK node. + // We sometimes fail to match PACK if we demand known undef elements. + if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE || + Root.getOpcode() == X86ISD::PACKSS || + Root.getOpcode() == X86ISD::PACKUS)) return SDValue(); // Nothing to do! ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); @@ -37852,6 +38357,12 @@ static SDValue combineX86ShuffleChainWithExtract( unsigned RootSizeInBits = RootVT.getSizeInBits(); assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask"); + // Bail if we have any smaller inputs. + if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) { + return Input.getValueSizeInBits() < RootSizeInBits; + })) + return SDValue(); + SmallVector WideInputs(Inputs.begin(), Inputs.end()); SmallVector Offsets(NumInputs, 0); @@ -37894,16 +38405,6 @@ static SDValue combineX86ShuffleChainWithExtract( })) return SDValue(); - for (SDValue &NewInput : WideInputs) { - assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && - "Shuffle vector size mismatch"); - if (WideSizeInBits > NewInput.getValueSizeInBits()) - NewInput = widenSubVector(NewInput, false, Subtarget, DAG, - SDLoc(NewInput), WideSizeInBits); - assert(WideSizeInBits == NewInput.getValueSizeInBits() && - "Unexpected subvector extraction"); - } - // Create new mask for larger type. 
for (unsigned i = 1; i != NumInputs; ++i) Offsets[i] += i * Scale * NumMaskElts; @@ -37928,7 +38429,10 @@ static SDValue combineX86ShuffleChainWithExtract( // Attempt to combine wider chain. // TODO: Can we use a better Root? - SDValue WideRoot = WideInputs[0]; + SDValue WideRoot = WideInputs.front().getValueSizeInBits() > + WideInputs.back().getValueSizeInBits() + ? WideInputs.front() + : WideInputs.back(); if (SDValue WideShuffle = combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth, HasVariableMask, AllowVariableCrossLaneMask, @@ -38267,9 +38771,9 @@ static SDValue combineX86ShufflesRecursively( assert(RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"); - assert(Root.getSimpleValueType().isVector() && - "Shuffles operate on vector types!"); - unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); + MVT RootVT = Root.getSimpleValueType(); + assert(RootVT.isVector() && "Shuffles operate on vector types!"); + unsigned RootSizeInBits = RootVT.getSizeInBits(); // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. @@ -38298,16 +38802,27 @@ static SDValue combineX86ShufflesRecursively( APInt OpUndef, OpZero; APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); - if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, - OpZero, DAG, Depth, false)) - return SDValue(); - - // Shuffle inputs must not be larger than the shuffle result. - // TODO: Relax this for single input faux shuffles (trunc/extract_subvector). - if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { - return OpInput.getValueSizeInBits() > VT.getSizeInBits(); - })) + if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, + OpZero, DAG, Depth, false)) { + // Shuffle inputs must not be larger than the shuffle result. + // TODO: Relax this for single input faux shuffles (e.g. trunc). + if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { + return OpInput.getValueSizeInBits() > VT.getSizeInBits(); + })) + return SDValue(); + } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 && + !isNullConstant(Op.getOperand(1))) { + SDValue SrcVec = Op.getOperand(0); + int ExtractIdx = Op.getConstantOperandVal(1); + unsigned NumElts = VT.getVectorNumElements(); + OpInputs.assign({SrcVec}); + OpMask.assign(NumElts, SM_SentinelUndef); + std::iota(OpMask.begin(), OpMask.end(), ExtractIdx); + OpZero = OpUndef = APInt::getNullValue(NumElts); + } else { return SDValue(); + } // If the shuffle result was smaller than the root, we need to adjust the // mask indices and pad the mask with undefs. @@ -38467,13 +38982,12 @@ static SDValue combineX86ShufflesRecursively( // Handle the all undef/zero/ones cases early. 
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) - return DAG.getUNDEF(Root.getValueType()); + return DAG.getUNDEF(RootVT); if (all_of(Mask, [](int Idx) { return Idx < 0; })) - return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, - SDLoc(Root)); + return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root)); if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) && none_of(Mask, [](int M) { return M == SM_SentinelZero; })) - return getOnesVector(Root.getValueType(), DAG, SDLoc(Root)); + return getOnesVector(RootVT, DAG, SDLoc(Root)); assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= IsOpVariableMask; @@ -38533,7 +39047,7 @@ static SDValue combineX86ShufflesRecursively( // NOTE: This will update the Ops and Mask. if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget)) - return DAG.getBitcast(Root.getValueType(), HOp); + return DAG.getBitcast(RootVT, HOp); // Try to refine our inputs given our knowledge of target shuffle mask. for (auto I : enumerate(Ops)) { @@ -38578,6 +39092,8 @@ static SDValue combineX86ShufflesRecursively( // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now? // Widen any subvector shuffle inputs we've collected. + // TODO: Remove this to avoid generating temporary nodes, we should only + // widen once combineX86ShuffleChain has found a match. if (any_of(Ops, [RootSizeInBits](SDValue Op) { return Op.getValueSizeInBits() < RootSizeInBits; })) { @@ -38823,8 +39339,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SDValue N0 = V.getOperand(0); SDValue N1 = V.getOperand(1); unsigned Imm = V.getConstantOperandVal(2); - const X86Subtarget &Subtarget = - static_cast(DAG.getSubtarget()); + const X86Subtarget &Subtarget = DAG.getSubtarget(); if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) || X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget)) return SDValue(); @@ -38869,21 +39384,24 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ShuffleVT = N.getValueType(); - auto IsMergeableWithShuffle = [](SDValue Op) { + auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) { // AllZeros/AllOnes constants are freely shuffled and will peek through // bitcasts. Other constant build vectors do not peek through bitcasts. Only // merge with target shuffles if it has one use so shuffle combining is - // likely to kick in. + // likely to kick in. Shuffles of splats are expected to be removed. return ISD::isBuildVectorAllOnes(Op.getNode()) || ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) || - (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()); + (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) || + (FoldLoad && isShuffleFoldableLoad(Op)) || + DAG.isSplatValue(Op, /*AllowUndefs*/ false); }; auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) { // Ensure we only shuffle whole vector src elements, unless its a logical // binops where we can more aggressively move shuffles from dst to src. 
return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR || + BinOp == X86ISD::ANDNP || (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits()); }; @@ -38913,7 +39431,8 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) { SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); - if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) { + if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) || + IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) { SDValue LHS, RHS; Op00 = DAG.getBitcast(ShuffleVT, Op00); Op01 = DAG.getBitcast(ShuffleVT, Op01); @@ -39054,6 +39573,11 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SmallVector Mask; unsigned Opcode = N.getOpcode(); + // FIXME: Remove this after we support vector FP16 + if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(), + Subtarget)) + return SDValue(); + if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) return R; @@ -39471,7 +39995,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1)); SmallVector SubOps; - if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2) + if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2) return SubOps[Idx & 1]; unsigned NumElts = Src.getValueType().getVectorNumElements(); if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR && @@ -39581,7 +40105,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; - } else if (KnownUndef0[i] || KnownZero0[i]) { + } + + if (KnownUndef0[i] || KnownZero0[i]) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; @@ -40016,16 +40542,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // Simplify source operands based on shuffle mask. // TODO - merge this into combineX86ShufflesRecursively. - APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, - DCI)) + if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI)) return SDValue(N, 0); // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). // Perform this after other shuffle combines to allow inner shuffles to be // combined away first. - if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N))) + if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl)) return BinOp; } @@ -40212,6 +40736,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( Depth + 1)) return true; + // Fold shift(0,x) -> 0 + if (DemandedElts.isSubsetOf(KnownZero)) + return TLO.CombineTo( + Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); + // Aggressively peek through ops to get at the demanded elts. 
if (!DemandedElts.isAllOnes()) if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( @@ -40232,9 +40761,16 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; + + // Fold shift(0,x) -> 0 + if (DemandedElts.isSubsetOf(LHSZero)) + return TLO.CombineTo( + Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); + if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; + KnownZero = LHSZero; break; } @@ -40316,6 +40852,57 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( KnownZero.setHighBits(ShiftAmt); break; } + case X86ISD::ANDNP: { + // ANDNP = (~LHS & RHS); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { + APInt UndefElts; + SmallVector EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + APInt OpBits = APInt::getAllOnes(EltSizeInBits); + APInt OpElts = DemandedElts; + if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, + EltBits)) { + OpBits.clearAllBits(); + OpElts.clearAllBits(); + for (int I = 0; I != NumElts; ++I) + if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero()))) { + OpBits |= Invert ? ~EltBits[I] : EltBits[I]; + OpElts.setBit(I); + } + } + return std::make_pair(OpBits, OpElts); + }; + std::pair DemandLHS = GetDemandedMasks(RHS); + std::pair DemandRHS = GetDemandedMasks(LHS, true); + + APInt LHSUndef, LHSZero; + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(LHS, DemandLHS.second, LHSUndef, LHSZero, + TLO, Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(RHS, DemandRHS.second, RHSUndef, RHSZero, + TLO, Depth + 1)) + return true; + + if (!DemandedElts.isAllOnes()) { + SDValue NewLHS = SimplifyMultipleUseDemandedBits( + LHS, DemandLHS.first, DemandLHS.second, TLO.DAG, Depth + 1); + SDValue NewRHS = SimplifyMultipleUseDemandedBits( + RHS, DemandRHS.first, DemandRHS.second, TLO.DAG, Depth + 1); + if (NewLHS || NewRHS) { + NewLHS = NewLHS ? NewLHS : LHS; + NewRHS = NewRHS ? NewRHS : RHS; + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS)); + } + } + break; + } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); @@ -40620,7 +41207,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::UNPCKH: case X86ISD::BLENDI: // Integer ops. - case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. @@ -40651,10 +41237,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } } - // For broadcasts, unless we *only* demand the 0'th element, + // For splats, unless we *only* demand the 0'th element, // stop attempts at simplification here, we aren't going to improve things, // this is better than any potential shuffle. - if (isTargetShuffleSplat(Op) && !DemandedElts.isOne()) + if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) return false; // Get target/faux shuffle mask. @@ -40770,20 +41356,31 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( KnownBits KnownOp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + + // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast. // FIXME: Can we bound this better? 
APInt DemandedMask = APInt::getLowBitsSet(64, 32); - if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp, - TLO, Depth + 1)) + APInt DemandedMaskLHS = APInt::getAllOnes(64); + APInt DemandedMaskRHS = APInt::getAllOnes(64); + + bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512(); + if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS)) + DemandedMaskLHS = DemandedMask; + if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS)) + DemandedMaskRHS = DemandedMask; + + if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts, + KnownOp, TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, - TLO, Depth + 1)) + if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts, + KnownOp, TLO, Depth + 1)) return true; // Aggressively peek through ops to get at the demanded low bits. SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( - LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1); SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( - RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1); if (DemandedLHS || DemandedRHS) { DemandedLHS = DemandedLHS ? DemandedLHS : LHS; DemandedRHS = DemandedRHS ? DemandedRHS : RHS; @@ -41084,7 +41681,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero = KnownZero.zextOrSelf(BitWidth); + Known.Zero = KnownZero.zext(BitWidth); Known.Zero.setHighBits(BitWidth - NumElts); // MOVMSK only uses the MSB from each vector element. @@ -41291,12 +41888,8 @@ bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op, switch (Opc) { case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: - // TODO: Permit vXi64 types on 32-bit targets. - if (isTypeLegal(Op.getValueType().getVectorElementType())) { - UndefElts = APInt::getNullValue(NumElts); - return true; - } - return false; + UndefElts = APInt::getNullValue(NumElts); + return true; } return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts, @@ -42840,10 +43433,29 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, return SDValue(); SDLoc DL(ExtElt); + unsigned NumElts = VecVT.getVectorNumElements(); + unsigned EltSizeInBits = VecVT.getScalarSizeInBits(); + + // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits. + auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) { + if (V.getValueType() == MVT::v4i8) { + if (ZeroExtend && Subtarget.hasSSE41()) { + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + DAG.getConstant(0, DL, MVT::v4i32), + DAG.getBitcast(MVT::i32, V), + DAG.getIntPtrConstant(0, DL)); + return DAG.getBitcast(MVT::v16i8, V); + } + V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V, + ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8) + : DAG.getUNDEF(MVT::v4i8)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V, + DAG.getUNDEF(MVT::v8i8)); + }; // vXi8 mul reduction - promote to vXi16 mul reduction. 
if (Opc == ISD::MUL) { - unsigned NumElts = VecVT.getVectorNumElements(); if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts)) return SDValue(); if (VecVT.getSizeInBits() >= 128) { @@ -42858,11 +43470,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi); } } else { - if (VecVT == MVT::v4i8) - Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx, - DAG.getUNDEF(MVT::v4i8)); - Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, - DAG.getUNDEF(MVT::v8i8)); + Rdx = WidenToV16I8(Rdx, false); Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8)); Rdx = DAG.getBitcast(MVT::v8i16, Rdx); } @@ -42882,24 +43490,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, // vXi8 add reduction - sub 128-bit vector. if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) { - if (VecVT == MVT::v4i8) { - // Pad with zero. - if (Subtarget.hasSSE41()) { - Rdx = DAG.getBitcast(MVT::i32, Rdx); - Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, - DAG.getConstant(0, DL, MVT::v4i32), Rdx, - DAG.getIntPtrConstant(0, DL)); - Rdx = DAG.getBitcast(MVT::v16i8, Rdx); - } else { - Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx, - DAG.getConstant(0, DL, VecVT)); - } - } - if (Rdx.getValueType() == MVT::v8i8) { - // Pad with undef. - Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, - DAG.getUNDEF(MVT::v8i8)); - } + Rdx = WidenToV16I8(Rdx, true); Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, DAG.getConstant(0, DL, MVT::v16i8)); Rdx = DAG.getBitcast(MVT::v16i8, Rdx); @@ -42907,8 +43498,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, } // Must be a >=128-bit vector with pow2 elements. - if ((VecVT.getSizeInBits() % 128) != 0 || - !isPowerOf2_32(VecVT.getVectorNumElements())) + if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts)) return SDValue(); // vXi8 add reduction - sum lo/hi halves then use PSADBW. @@ -42931,6 +43521,48 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); } + // See if we can use vXi8 PSADBW add reduction for larger zext types. + // If the source vector values are 0-255, then we can use PSADBW to + // sum+zext v8i8 subvectors to vXi64, then perform the reduction. + // TODO: See if its worth avoiding vXi16/i32 truncations? + if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 && + DAG.computeKnownBits(Rdx).getMaxValue().ule(255) && + (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND || + Subtarget.hasAVX512())) { + EVT ByteVT = VecVT.changeVectorElementType(MVT::i8); + Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx); + if (ByteVT.getSizeInBits() < 128) + Rdx = WidenToV16I8(Rdx, true); + + // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW. + auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef Ops) { + MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64); + SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType()); + return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero); + }; + MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64); + Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder); + + // TODO: We could truncate to vXi16/vXi32 before performing the reduction. 
+ while (Rdx.getValueSizeInBits() > 128) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); + VecVT = Lo.getValueType(); + Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi); + } + assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected"); + + if (NumElts > 8) { + SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1}); + Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi); + } + + VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits()); + Rdx = DAG.getBitcast(VecVT, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize. if (!shouldUseHorizontalOp(true, DAG, Subtarget)) return SDValue(); @@ -42994,8 +43626,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, uint64_t Idx = CIdx->getZExtValue(); if (UndefVecElts[Idx]) return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); - return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()), - dl, VT); + return DAG.getConstant(EltBits[Idx].zext(VT.getScalarSizeInBits()), dl, + VT); } } @@ -43076,29 +43708,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, // but not // i1 = extract_vector_elt t0:1, Constant:i64<2> // since the latter would need its own MOVMSK. - if (CIdx && SrcVT.getScalarType() == MVT::i1) { + if (SrcVT.getScalarType() == MVT::i1) { + bool IsVar = !CIdx; SmallVector BoolExtracts; unsigned ResNo = InputVector.getResNo(); - auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) { + auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa(Use->getOperand(1)) && Use->getOperand(0).getResNo() == ResNo && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); + IsVar |= !isa(Use->getOperand(1)); return true; } return false; }; + // TODO: Can we drop the oneuse check for constant extracts? 
if (all_of(InputVector->uses(), IsBoolExtract) && - BoolExtracts.size() > 1) { + (IsVar || BoolExtracts.size() > 1)) { EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { for (SDNode *Use : BoolExtracts) { // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask - unsigned MaskIdx = Use->getConstantOperandVal(1); - APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx); - SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT); + // Mask = 1 << MaskIdx + SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8); + SDValue MaskBit = DAG.getConstant(1, dl, BCVT); + SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx); SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); DCI.CombineTo(Use, Res); @@ -43123,7 +43758,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, auto *LoadVec = dyn_cast(InputVector); if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() && SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && - !LikelyUsedAsVector) { + !LikelyUsedAsVector && LoadVec->isSimple()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue NewPtr = TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx); @@ -43133,16 +43768,111 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, SDValue Load = DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); - SDValue Chain = Load.getValue(1); - SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)}; - SDValue To[] = {Load, Chain}; - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); - return SDValue(N, 0); + DAG.makeEquivalentMemoryOrdering(LoadVec, Load); + return Load; } return SDValue(); } +// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). +// This is more or less the reverse of combineBitcastvxi1. +static SDValue combineToExtendBoolVectorInReg( + unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && + Opcode != ISD::ANY_EXTEND) + return SDValue(); + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + return SDValue(); + + EVT SVT = VT.getScalarType(); + EVT InSVT = N0.getValueType().getScalarType(); + unsigned EltSizeInBits = SVT.getSizeInBits(); + + // Input type must be extending a bool vector (bit-casted from a scalar + // integer) to legal integer types. + if (!VT.isVector()) + return SDValue(); + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) + return SDValue(); + if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + EVT SclVT = N00.getValueType(); + if (!SclVT.isScalarInteger()) + return SDValue(); + + SDValue Vec; + SmallVector ShuffleMask; + unsigned NumElts = VT.getVectorNumElements(); + assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); + + // Broadcast the scalar integer to the vector elements. + if (NumElts > EltSizeInBits) { + // If the scalar integer is greater than the vector element size, then we + // must split it down into sub-sections for broadcasting. For example: + // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. 
+ // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. + assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); + unsigned Scale = NumElts / EltSizeInBits; + EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + Vec = DAG.getBitcast(VT, Vec); + + for (unsigned i = 0; i != Scale; ++i) + ShuffleMask.append(EltSizeInBits, i); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && + (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { + // If we have register broadcast instructions, use the scalar size as the + // element type for the shuffle. Then cast to the wider element type. The + // widened bits won't be used, and this might allow the use of a broadcast + // load. + assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); + unsigned Scale = EltSizeInBits / NumElts; + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + ShuffleMask.append(NumElts * Scale, 0); + Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); + Vec = DAG.getBitcast(VT, Vec); + } else { + // For smaller scalar integers, we can simply any-extend it to the vector + // element size (we don't care about the upper bits) and broadcast it to all + // elements. + SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); + ShuffleMask.append(NumElts, 0); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + } + + // Now, mask the relevant bit in each element. + SmallVector Bits; + for (unsigned i = 0; i != NumElts; ++i) { + int BitIdx = (i % EltSizeInBits); + APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); + Bits.push_back(DAG.getConstant(Bit, DL, SVT)); + } + SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); + Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); + + // Compare against the bitmask and extend the result. + EVT CCVT = VT.changeVectorElementType(MVT::i1); + Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); + Vec = DAG.getSExtOrTrunc(Vec, DL, VT); + + // For SEXT, this is now done, otherwise shift the result down for + // zero-extension. + if (Opcode == ISD::SIGN_EXTEND) + return Vec; + return DAG.getNode(ISD::SRL, DL, VT, Vec, + DAG.getConstant(EltSizeInBits - 1, DL, VT)); +} + /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? @@ -43270,8 +44000,8 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, SDValue FVal = N->getOperand(2); SmallVector CatOpsT, CatOpsF; if (!TVal.hasOneUse() || !FVal.hasOneUse() || - !collectConcatOps(TVal.getNode(), CatOpsT) || - !collectConcatOps(FVal.getNode(), CatOpsF)) + !collectConcatOps(TVal.getNode(), CatOpsT, DAG) || + !collectConcatOps(FVal.getNode(), CatOpsF, DAG)) return SDValue(); auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, @@ -43360,19 +44090,17 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { /// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. 
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) || ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); - // Don't optimize before the condition has been transformed to a legal type - // and don't ever optimize vector selects that map to AVX512 mask-registers. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned BitWidth = Cond.getScalarValueSizeInBits(); - if (BitWidth < 8 || BitWidth > 64) - return SDValue(); + EVT VT = N->getValueType(0); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and @@ -43384,8 +44112,6 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT VT = N->getValueType(0); if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and @@ -43403,6 +44129,11 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, if (VT.is512BitVector()) return SDValue(); + // Don't optimize before the condition has been transformed to a legal type + // and don't ever optimize vector selects that map to AVX512 mask-registers. + if (BitWidth < 8 || BitWidth > 64) + return SDValue(); + auto OnlyUsedAsSelectCond = [](SDValue Cond) { for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); UI != UE; ++UI) @@ -43542,9 +44273,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return V; // Convert vselects with constant condition into shuffles. - if (CondConstantVector && DCI.isBeforeLegalizeOps()) { + if (CondConstantVector && DCI.isBeforeLegalizeOps() && + (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) { SmallVector Mask; - if (createShuffleMaskFromVSELECT(Mask, Cond)) + if (createShuffleMaskFromVSELECT(Mask, Cond, + N->getOpcode() == X86ISD::BLENDV)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); } @@ -43565,11 +44298,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // getConstVector sets negative shuffle mask values as undef, so ensure // we hardcode SM_SentinelZero values to zero (0x80). if (CondMask[i] < NumElts) { - LHSMask[i] = (LHSMask[i] == SM_SentinelZero) ? 0x80 : LHSMask[i]; + LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i]; RHSMask[i] = 0x80; } else { LHSMask[i] = 0x80; - RHSMask[i] = (RHSMask[i] == SM_SentinelZero) ? 0x80 : RHSMask[i]; + RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i]; } } LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), @@ -43586,7 +44319,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && VT != MVT::f128 && + VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { @@ -43880,7 +44613,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If this an avx512 target we can improve the use of zero masking by // swapping the operands and inverting the condition. if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() && - Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorAllZeros(LHS.getNode()) && !ISD::isBuildVectorAllZeros(RHS.getNode())) { // Invert the cond to not(cond) : xor(op,allones)=not(op) @@ -43889,6 +44622,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } + // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might + // get split by legalization. + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST && + CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() && + TLI.isTypeLegal(VT.getScalarType())) { + EVT ExtCondVT = VT.changeVectorElementTypeToInteger(); + if (SDValue ExtCond = combineToExtendBoolVectorInReg( + ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) { + ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond); + return DAG.getSelect(DL, VT, ExtCond, LHS, RHS); + } + } + // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); @@ -44301,14 +45047,15 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { + bool FoundAndLSB = false; SDValue Carry = EFLAGS.getOperand(0); while (Carry.getOpcode() == ISD::TRUNCATE || Carry.getOpcode() == ISD::ZERO_EXTEND || - Carry.getOpcode() == ISD::SIGN_EXTEND || - Carry.getOpcode() == ISD::ANY_EXTEND || (Carry.getOpcode() == ISD::AND && - isOneConstant(Carry.getOperand(1)))) + isOneConstant(Carry.getOperand(1)))) { + FoundAndLSB |= Carry.getOpcode() == ISD::AND; Carry = Carry.getOperand(0); + } if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? @@ -44339,6 +45086,14 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { CarryOp1.getOpcode() == X86ISD::ADD && isOneConstant(CarryOp1.getOperand(1))) return CarryOp1; + } else if (FoundAndLSB) { + SDLoc DL(Carry); + SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType()); + if (Carry.getOpcode() == ISD::SRL) { + BitNo = Carry.getOperand(1); + Carry = Carry.getOperand(0); + } + return getBT(Carry, BitNo, DL, DAG); } } } @@ -44533,6 +45288,12 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, if (!IsAnyOf && !IsAllOf) return SDValue(); + // TODO: Check more combining cases for me. + // Here we check the cmp use number to decide do combining or not. + // Currently we only get 2 tests about combining "MOVMSK(CONCAT(..))" + // and "MOVMSK(PCMPEQ(..))" are fit to use this constraint. + bool IsOneUse = CmpOp.getNode()->hasOneUse(); + // See if we can peek through to a vector with a wider element type, if the // signbits extend down to all the sub-elements as well. 
// Calling MOVMSK with the wider type, avoiding the bitcast, helps expose @@ -44561,9 +45322,9 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)). // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)). // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)). - if (VecVT.is256BitVector() && NumElts <= CmpBits) { + if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) { SmallVector Ops; - if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) && + if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) && Ops.size() == 2) { SDLoc DL(EFLAGS); EVT SubVT = Ops[0].getValueType().changeTypeToInteger(); @@ -44582,7 +45343,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X). // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)). // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)). - if (IsAllOf && Subtarget.hasSSE41()) { + if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) { MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; SDValue BC = peekThroughBitcasts(Vec); // Ensure MOVMSK was testing every signbit of BC. @@ -44734,7 +45495,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (!(FalseOp.getValueType() == MVT::f80 || (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) || (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) || - !Subtarget.hasCMov() || hasFPCMov(CC)) { + !Subtarget.canUseCMOV() || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); @@ -45181,8 +45942,6 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); - EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts); - // With AVX512 but without BWI, we would need to split v32i16. if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return SDValue(); @@ -45265,11 +46024,13 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Use SplitOpsAndApply to handle AVX splitting. 
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); + MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16); + return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, + DAG.getBitcast(OpVT, Ops[0]), + DAG.getBitcast(OpVT, Ops[1])); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, - { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) }, + return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1}, PMADDWDBuilder); } @@ -45622,12 +46383,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; - else if (SarConst.isNegative()) + if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); - else - return DAG.getNode(ISD::SRA, DL, VT, NN, - DAG.getConstant(SarConst, DL, CVT)); + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); } @@ -46034,11 +46794,9 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, EltBits[0].getZExtValue(), DAG); } - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -46461,11 +47219,17 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, FPLogic); } + if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() || + !N1.hasOneUse()) + return SDValue(); + + ISD::CondCode CC0 = cast(N0.getOperand(2))->get(); + ISD::CondCode CC1 = cast(N1.getOperand(2))->get(); + // The vector ISA for FP predicates is incomplete before AVX, so converting // COMIS* to CMPS* may not be a win before AVX. - // TODO: Check types/predicates to see if they are available with SSE/SSE2. - if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || - !N0.hasOneUse() || !N1.hasOneUse()) + if (!Subtarget.hasAVX() && + !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1))) return SDValue(); // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*) @@ -46482,10 +47246,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01); SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10); SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11); - SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, - cast(N0.getOperand(2))->get()); - SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, - cast(N1.getOperand(2))->get()); + SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0); + SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1); SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex); } @@ -46891,6 +47653,53 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; + // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) + // iff c2 is all/no bits mask - i.e. 
a select-with-zero mask. + // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW? + if (VT.isVector() && getTargetConstantFromNode(N1)) { + unsigned Opc0 = N0.getOpcode(); + if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) && + getTargetConstantFromNode(N0.getOperand(1)) && + DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() && + N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) { + SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1); + return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul); + } + } + + // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant + // avoids slow variable shift (moving shift amount to ECX etc.) + if (isOneConstant(N1) && N0->hasOneUse()) { + SDValue Src = N0; + while ((Src.getOpcode() == ISD::ZERO_EXTEND || + Src.getOpcode() == ISD::TRUNCATE) && + Src.getOperand(0)->hasOneUse()) + Src = Src.getOperand(0); + bool ContainsNOT = false; + X86::CondCode X86CC = X86::COND_B; + // Peek through AND(NOT(SRL(X,Y)),1). + if (isBitwiseNot(Src)) { + Src = Src.getOperand(0); + X86CC = X86::COND_AE; + ContainsNOT = true; + } + if (Src.getOpcode() == ISD::SRL && + !isa(Src.getOperand(1))) { + SDValue BitNo = Src.getOperand(1); + Src = Src.getOperand(0); + // Peek through AND(SRL(NOT(X),Y),1). + if (isBitwiseNot(Src)) { + Src = Src.getOperand(0); + X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE; + ContainsNOT = true; + } + // If we have BMI2 then SHRX should be faster for i32/i64 cases. + if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32)) + if (SDValue BT = getBT(Src, BitNo, dl, DAG)) + return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT); + } + } + if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { // Attempt to recursively combine a bitmask AND with shuffles. SDValue Op(N, 0); @@ -46899,32 +47708,44 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // If either operand is a constant mask, then only the elements that aren't // zero are actually demanded by the other operand. 
- auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { + auto GetDemandedMasks = [&](SDValue Op) { APInt UndefElts; SmallVector EltBits; int NumElts = VT.getVectorNumElements(); int EltSizeInBits = VT.getScalarSizeInBits(); - if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) - return false; - - APInt DemandedBits = APInt::getZero(EltSizeInBits); - APInt DemandedElts = APInt::getZero(NumElts); - for (int I = 0; I != NumElts; ++I) - if (!EltBits[I].isZero()) { - DemandedBits |= EltBits[I]; - DemandedElts.setBit(I); - } - - APInt KnownUndef, KnownZero; - return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, - KnownZero, DCI) || - TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI); + APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); + APInt DemandedElts = APInt::getAllOnes(NumElts); + if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, + EltBits)) { + DemandedBits.clearAllBits(); + DemandedElts.clearAllBits(); + for (int I = 0; I != NumElts; ++I) + if (!EltBits[I].isZero()) { + DemandedBits |= EltBits[I]; + DemandedElts.setBit(I); + } + } + return std::make_pair(DemandedBits, DemandedElts); }; - if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { + std::pair Demand0 = GetDemandedMasks(N1); + std::pair Demand1 = GetDemandedMasks(N0); + + if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) || + TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) || + TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) || + TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); } + + SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first, + Demand0.second, DAG); + SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first, + Demand1.second, DAG); + if (NewN0 || NewN1) + return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0, + NewN1 ? NewN1 : N1); } // Attempt to combine a scalar bitmask AND with an extracted shuffle. @@ -47127,8 +47948,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // into: // srl(ctlz x), log2(bitsize(x)) // Input pattern is checked by caller. -static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, - SelectionDAG &DAG) { +static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) { SDValue Cmp = Op.getOperand(1); EVT VT = Cmp.getOperand(0).getValueType(); unsigned Log2b = Log2_32(VT.getSizeInBits()); @@ -47139,7 +47959,7 @@ static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, DAG.getConstant(Log2b, dl, MVT::i8)); - return DAG.getZExtOrTrunc(Scc, dl, ExtTy); + return Scc; } // Try to transform: @@ -47199,11 +48019,10 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // or(srl(ctlz),srl(ctlz)). // The dag combiner can then fold it into: // srl(or(ctlz, ctlz)). 
- EVT VT = OR->getValueType(0); - SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); + SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG); SDValue Ret, NewRHS; - if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) - Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); + if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG))) + Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS); if (!Ret) return SDValue(); @@ -47216,21 +48035,18 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); - NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG); if (!NewRHS) return SDValue(); - Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); + Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS); } - if (Ret) - Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); - - return Ret; + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); } static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, - SDValue And1_L, SDValue And1_R, SDLoc DL, - SelectionDAG &DAG) { + SDValue And1_L, SDValue And1_R, + const SDLoc &DL, SelectionDAG &DAG) { if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) return SDValue(); SDValue NotOp = And0_L->getOperand(0); @@ -47352,7 +48168,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts); if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && N1.getConstantOperandAPInt(1) == HalfElts && - DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) { + DAG.MaskedVectorIsZero(N0, UpperElts)) { return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N0, 0, DAG, dl, HalfElts), @@ -47360,7 +48176,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL && N0.getConstantOperandAPInt(1) == HalfElts && - DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) { + DAG.MaskedVectorIsZero(N1, UpperElts)) { return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N1, 0, DAG, dl, HalfElts), @@ -47389,9 +48205,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (!EltBits[I].isAllOnes()) DemandedElts.setBit(I); - APInt KnownUndef, KnownZero; - return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, - KnownZero, DCI); + return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI); }; if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { if (N->getOpcode() != ISD::DELETED_NODE) @@ -47618,7 +48432,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, // clip to 0-255. if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 && VT == MVT::v16i8) { - if (auto USatVal = detectSSatPattern(In, VT, true)) { + if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, DL, DAG, Subtarget); @@ -47643,7 +48457,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, VT.getSizeInBits() >= 64 && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { - if (auto USatVal = detectSSatPattern(In, VT, true)) { + if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). 
// Only do this when the result is at least 64 bits or we'll leaving // dangling PACKSSDW nodes. @@ -47660,7 +48474,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); } - if (auto SSatVal = detectSSatPattern(In, VT)) + if (SDValue SSatVal = detectSSatPattern(In, VT)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } @@ -47671,10 +48485,10 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) { unsigned TruncOpc = 0; SDValue SatVal; - if (auto SSatVal = detectSSatPattern(In, VT)) { + if (SDValue SSatVal = detectSSatPattern(In, VT)) { SatVal = SSatVal; TruncOpc = X86ISD::VTRUNCS; - } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) { + } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) { SatVal = USatVal; TruncOpc = X86ISD::VTRUNCUS; } @@ -47706,7 +48520,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient -/// X86ISD::AVG instruction. +/// ISD::AVGCEILU (AVG) instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { @@ -47769,7 +48583,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); + return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); }; auto AVGSplitter = [&](std::array Ops) { @@ -47872,7 +48686,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && - Ld->getAlignment() >= 16) || + Ld->getAlign() >= Align(16)) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, *Ld->getMemOperand(), &Fast) && !Fast))) { @@ -48340,7 +49154,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Split under-aligned vector non-temporal stores. if (St->isNonTemporal() && StVT == VT && - St->getAlignment() < VT.getStoreSize()) { + St->getAlign().value() < VT.getStoreSize()) { // ZMM/YMM nt-stores - either it can be stored as a series of shorter // vectors or the legalizer can scalarize it to use MOVNTI. if (VT.is256BitVector() || VT.is512BitVector()) { @@ -48374,9 +49188,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } // Try to fold a VTRUNCUS or VTRUNCS into a truncating store. - if (!St->isTruncatingStore() && StoredVal.hasOneUse() && + if (!St->isTruncatingStore() && (StoredVal.getOpcode() == X86ISD::VTRUNCUS || StoredVal.getOpcode() == X86ISD::VTRUNCS) && + StoredVal.hasOneUse() && TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; return EmitTruncSStore(IsSigned, St->getChain(), @@ -48385,15 +49200,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } // Try to fold a extract_element(VTRUNC) pattern into a truncating store. 
- if (!St->isTruncatingStore() && StoredVal.hasOneUse()) { + if (!St->isTruncatingStore()) { auto IsExtractedElement = [](SDValue V) { - if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse()) + if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse()) V = V.getOperand(0); unsigned Opc = V.getOpcode(); - if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) { - if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1))) - return V.getOperand(0); - } + if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) && + isNullConstant(V.getOperand(1)) && V.hasOneUse() && + V.getOperand(0).hasOneUse()) + return V.getOperand(0); return SDValue(); }; if (SDValue Extract = IsExtractedElement(StoredVal)) { @@ -48531,10 +49346,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits(); APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef, - KnownZero, DCI)) { + if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); @@ -49165,7 +49978,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, // PACK should still be worth it for 128-bit vectors if the sources were // originally concatenated from subvectors. SmallVector ConcatOps; - if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps)) + if (VT.getSizeInBits() > 128 || + !collectConcatOps(In.getNode(), ConcatOps, DAG)) return SDValue(); } @@ -49478,9 +50292,9 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, SDValue In = N->getOperand(0); SDLoc DL(N); - if (auto SSatVal = detectSSatPattern(In, VT)) + if (SDValue SSatVal = detectSSatPattern(In, VT)) return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); - if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) + if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -49567,10 +50381,14 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { if (!UndefElts[I] && !EltBits[I].isSignMask()) return SDValue(); - return peekThroughBitcasts(Op0); + // Only allow bitcast from correctly-sized constant. + Op0 = peekThroughBitcasts(Op0); + if (Op0.getScalarValueSizeInBits() == ScalarSize) + return Op0; } - } - } + break; + } // case + } // switch return SDValue(); } @@ -50074,10 +50892,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); // Convert a full vector load into vzload when not all bits are needed. 
@@ -50144,26 +50960,70 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); MVT VT = N->getSimpleValueType(0); + // ANDNP(undef, x) -> 0 + // ANDNP(x, undef) -> 0 + if (N0.isUndef() || N1.isUndef()) + return DAG.getConstant(0, SDLoc(N), VT); + // ANDNP(0, x) -> x - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) - return N->getOperand(1); + if (ISD::isBuildVectorAllZeros(N0.getNode())) + return N1; // ANDNP(x, 0) -> 0 - if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) + if (ISD::isBuildVectorAllZeros(N1.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Turn ANDNP back to AND if input is inverted. - if (SDValue Not = IsNOT(N->getOperand(0), DAG)) - return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), - N->getOperand(1)); + if (SDValue Not = IsNOT(N0, DAG)) + return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1); + + // TODO: Constant fold NOT(N0) to allow us to use AND. + // TODO: Do this in IsNOT with suitable oneuse checks? // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; + + // If either operand is a constant mask, then only the elements that aren't + // zero are actually demanded by the other operand. + auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { + APInt UndefElts; + SmallVector EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); + APInt DemandedElts = APInt::getAllOnes(NumElts); + if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, + EltBits)) { + DemandedBits.clearAllBits(); + DemandedElts.clearAllBits(); + for (int I = 0; I != NumElts; ++I) + if ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero())) { + DemandedBits |= Invert ? ~EltBits[I] : EltBits[I]; + DemandedElts.setBit(I); + } + } + return std::make_pair(DemandedBits, DemandedElts); + }; + std::pair Demand0 = GetDemandedMasks(N1); + std::pair Demand1 = GetDemandedMasks(N0, true); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) || + TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) || + TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) || + TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } } return SDValue(); @@ -50191,11 +51051,9 @@ static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, SDValue Src = N->getOperand(IsStrict ? 
1 : 0); if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) { - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getLowBitsSet(8, 4); - if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, - DCI)) { + if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); @@ -50453,110 +51311,6 @@ static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { return Res; } -// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). -// This is more or less the reverse of combineBitcastvxi1. -static SDValue -combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - unsigned Opcode = N->getOpcode(); - if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && - Opcode != ISD::ANY_EXTEND) - return SDValue(); - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - EVT SVT = VT.getScalarType(); - EVT InSVT = N0.getValueType().getScalarType(); - unsigned EltSizeInBits = SVT.getSizeInBits(); - - // Input type must be extending a bool vector (bit-casted from a scalar - // integer) to legal integer types. - if (!VT.isVector()) - return SDValue(); - if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) - return SDValue(); - if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) - return SDValue(); - - SDValue N00 = N0.getOperand(0); - EVT SclVT = N0.getOperand(0).getValueType(); - if (!SclVT.isScalarInteger()) - return SDValue(); - - SDLoc DL(N); - SDValue Vec; - SmallVector ShuffleMask; - unsigned NumElts = VT.getVectorNumElements(); - assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); - - // Broadcast the scalar integer to the vector elements. - if (NumElts > EltSizeInBits) { - // If the scalar integer is greater than the vector element size, then we - // must split it down into sub-sections for broadcasting. For example: - // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. - // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. - assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); - unsigned Scale = NumElts / EltSizeInBits; - EVT BroadcastVT = - EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); - Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); - Vec = DAG.getBitcast(VT, Vec); - - for (unsigned i = 0; i != Scale; ++i) - ShuffleMask.append(EltSizeInBits, i); - Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); - } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && - (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { - // If we have register broadcast instructions, use the scalar size as the - // element type for the shuffle. Then cast to the wider element type. The - // widened bits won't be used, and this might allow the use of a broadcast - // load. 
- assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); - unsigned Scale = EltSizeInBits / NumElts; - EVT BroadcastVT = - EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); - Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); - ShuffleMask.append(NumElts * Scale, 0); - Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); - Vec = DAG.getBitcast(VT, Vec); - } else { - // For smaller scalar integers, we can simply any-extend it to the vector - // element size (we don't care about the upper bits) and broadcast it to all - // elements. - SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); - Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); - ShuffleMask.append(NumElts, 0); - Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); - } - - // Now, mask the relevant bit in each element. - SmallVector Bits; - for (unsigned i = 0; i != NumElts; ++i) { - int BitIdx = (i % EltSizeInBits); - APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); - Bits.push_back(DAG.getConstant(Bit, DL, SVT)); - } - SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); - Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); - - // Compare against the bitmask and extend the result. - EVT CCVT = VT.changeVectorElementType(MVT::i1); - Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); - Vec = DAG.getSExtOrTrunc(Vec, DL, VT); - - // For SEXT, this is now done, otherwise shift the result down for - // zero-extension. - if (Opcode == ISD::SIGN_EXTEND) - return Vec; - return DAG.getNode(ISD::SRL, DL, VT, Vec, - DAG.getConstant(EltSizeInBits - 1, DL, VT)); -} - // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -50636,7 +51390,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) + if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0, + DAG, DCI, Subtarget)) return V; if (VT.isVector()) { @@ -50790,7 +51545,8 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) + if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0, + DAG, DCI, Subtarget)) return V; if (VT.isVector()) @@ -50832,7 +51588,7 @@ static bool isOrXorXorTree(SDValue X, bool Root = true) { /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp /// expansion. -template +template static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV) { SDValue Op0 = X.getOperand(0); @@ -50845,7 +51601,8 @@ static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, if (HasPT) return DAG.getNode(ISD::OR, DL, VecVT, A, B); return DAG.getNode(ISD::AND, DL, CmpVT, A, B); - } else if (X.getOpcode() == ISD::XOR) { + } + if (X.getOpcode() == ISD::XOR) { SDValue A = SToV(Op0); SDValue B = SToV(Op1); if (VecVT != CmpVT) @@ -51134,6 +51891,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, LHS.getValueType() == MVT::v4f32) return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); + // X pred 0.0 --> X pred -X + // If the negation of X already exists, use it in the comparison. 
This removes + // the need to materialize 0.0 and allows matching to SSE's MIN/MAX + // instructions in patterns with a 'select' node. + if (isNullFPScalarOrVectorConst(RHS)) { + SDVTList FNegVT = DAG.getVTList(OpVT); + if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS})) + return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC); + } + return SDValue(); } @@ -51145,16 +51912,18 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, MVT VT = N->getSimpleValueType(0); unsigned NumBits = VT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); + unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits(); + assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types"); // Perform constant folding. - if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { - assert(VT == MVT::i32 && "Unexpected result type"); + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) { APInt Imm(32, 0); - for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { - if (!Src.getOperand(Idx).isUndef() && - Src.getConstantOperandAPInt(Idx).isNegative()) + for (unsigned Idx = 0; Idx != NumElts; ++Idx) + if (!UndefElts[Idx] && EltBits[Idx].isNegative()) Imm.setBit(Idx); - } + return DAG.getConstant(Imm, SDLoc(N), VT); } @@ -51713,8 +52482,6 @@ static bool needCarryOrOverflowFlag(SDValue Flags) { CC = (X86::CondCode)User->getConstantOperandVal(0); break; case X86ISD::BRCOND: - CC = (X86::CondCode)User->getConstantOperandVal(2); - break; case X86ISD::CMOV: CC = (X86::CondCode)User->getConstantOperandVal(2); break; @@ -51743,10 +52510,14 @@ static bool onlyZeroFlagUsed(SDValue Flags) { default: // Be conservative. return false; - case X86ISD::SETCC: CCOpNo = 0; break; - case X86ISD::SETCC_CARRY: CCOpNo = 0; break; - case X86ISD::BRCOND: CCOpNo = 2; break; - case X86ISD::CMOV: CCOpNo = 2; break; + case X86ISD::SETCC: + case X86ISD::SETCC_CARRY: + CCOpNo = 0; + break; + case X86ISD::BRCOND: + case X86ISD::CMOV: + CCOpNo = 2; + break; } X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); @@ -51757,6 +52528,215 @@ static bool onlyZeroFlagUsed(SDValue Flags) { return true; } +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. +/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}. +static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, + SDValue X, SDValue Y, + SelectionDAG &DAG, + bool ZeroSecondOpOnly = false) { + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // Look through a one-use zext. + if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) + Y = Y.getOperand(0); + + X86::CondCode CC; + SDValue EFLAGS; + if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) { + CC = (X86::CondCode)Y.getConstantOperandVal(0); + EFLAGS = Y.getOperand(1); + } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) && + Y.hasOneUse()) { + EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC); + } + + if (!EFLAGS) + return SDValue(); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. 
+ auto *ConstantX = dyn_cast(X); + if (ConstantX && !ZeroSecondOpOnly) { + if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) || + (IsSub && CC == X86::COND_B && ConstantX->isZero())) { + // This is a complicated way to get -1 or 0 from the carry flag: + // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax + // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + EFLAGS); + } + + if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) || + (IsSub && CC == X86::COND_A && ConstantX->isZero())) { + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa(EFLAGS.getOperand(1))) { + // Swap the operands of a SUB, and we have the same pattern as above. + // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB + // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + NewEFLAGS); + } + } + } + + if (CC == X86::COND_B) { + // X + SETB Z --> adc X, 0 + // X - SETB Z --> sbb X, 0 + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), EFLAGS); + } + + if (ZeroSecondOpOnly) + return SDValue(); + + if (CC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa(EFLAGS.getOperand(1))) { + SDValue NewSub = + DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(0, DL, VT), NewEFLAGS); + } + } + + if (CC == X86::COND_AE) { + // X + SETAE --> sbb X, -1 + // X - SETAE --> adc X, -1 + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), EFLAGS); + } + + if (CC == X86::COND_BE) { + // X + SETBE --> sbb X, -1 + // X - SETBE --> adc X, -1 + // Try to convert COND_BE into COND_AE in an attempt to facilitate + // materializing "setae reg". + // + // Do not flip "e <= c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa(EFLAGS.getOperand(1))) { + SDValue NewSub = + DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? 
X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), NewEFLAGS); + } + } + + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() || + !X86::isZeroNode(EFLAGS.getOperand(1)) || + !EFLAGS.getOperand(0).getValueType().isInteger()) + return SDValue(); + + SDValue Z = EFLAGS.getOperand(0); + EVT ZVT = Z.getValueType(); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. + if (ConstantX) { + // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with + // fake operands: + // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) + // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) + if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) || + (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) { + SDValue Zero = DAG.getConstant(0, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + } + + // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' + // with fake operands: + // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) + // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) + if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) || + (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) { + SDValue One = DAG.getConstant(1, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + Cmp1.getValue(1)); + } + } + + // (cmp Z, 1) sets the carry flag if Z is 0. + SDValue One = DAG.getConstant(1, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); + + // Add the flags type for ADC/SBB nodes. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) + // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) + if (CC == X86::COND_NE) + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, + DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); + + // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) + // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, + DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); +} + +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. +static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { + bool IsSub = N->getOpcode() == ISD::SUB; + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG)) + return ADCOrSBB; + + // Commute and try again (negate the result for subtracts). 
+ if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) { + if (IsSub) + ADCOrSBB = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB); + return ADCOrSBB; + } + + return SDValue(); +} + static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { // Only handle test patterns. if (!isNullConstant(N->getOperand(1))) @@ -51792,6 +52772,16 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { } } + // Peek through any zero-extend if we're only testing for a zero result. + if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) { + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.getScalarSizeInBits() >= 8 && + DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src, + DAG.getConstant(0, dl, SrcVT)); + } + // Look for a truncate. if (Op.getOpcode() != ISD::TRUNCATE) return SDValue(); @@ -51867,7 +52857,8 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); MVT VT = LHS.getSimpleValueType(); - unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB; + bool IsSub = X86ISD::SUB == N->getOpcode(); + unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD; // If we don't use the flag result, simplify back to a generic ADD/SUB. if (!N->hasAnyUseOfValue(1)) { @@ -51889,26 +52880,29 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, MatchGeneric(LHS, RHS, false); MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); - return SDValue(); + // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the + // EFLAGS result doesn't change. + return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG, + /*ZeroSecondOpOnly*/ true); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue BorrowIn = N->getOperand(2); + + if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); - return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, - N->getOperand(0), N->getOperand(1), - Flags); + return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags); } // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) // iff the flag result is dead. - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) && + if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) && !N->hasAnyUseOfValue(1)) - return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0), - Op0.getOperand(1), N->getOperand(2)); + return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0), + LHS.getOperand(1), BorrowIn); return SDValue(); } @@ -51916,228 +52910,60 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + auto *LHSC = dyn_cast(LHS); + auto *RHSC = dyn_cast(RHS); + + // Canonicalize constant to RHS. 
+ if (LHSC && !RHSC) + return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS, + CarryIn); + // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. - if (X86::isZeroNode(N->getOperand(0)) && - X86::isZeroNode(N->getOperand(1)) && + if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); - SDValue Res1 = - DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - N->getOperand(2)), - DAG.getConstant(1, DL, VT)); + SDValue Res1 = DAG.getNode( + ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn), + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { - MVT VT = N->getSimpleValueType(0); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, - N->getOperand(0), N->getOperand(1), - Flags); - } - - return SDValue(); -} - -/// If this is an add or subtract where one operand is produced by a cmp+setcc, -/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} -/// with CMP+{ADC, SBB}. -static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { - bool IsSub = N->getOpcode() == ISD::SUB; - SDValue X = N->getOperand(0); - SDValue Y = N->getOperand(1); - - // If this is an add, canonicalize a zext operand to the RHS. - // TODO: Incomplete? What if both sides are zexts? - if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && - Y.getOpcode() != ISD::ZERO_EXTEND) - std::swap(X, Y); - - // Look through a one-use zext. - bool PeekedThroughZext = false; - if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { - Y = Y.getOperand(0); - PeekedThroughZext = true; - } - - // If this is an add, canonicalize a setcc operand to the RHS. - // TODO: Incomplete? What if both sides are setcc? - // TODO: Should we allow peeking through a zext of the other operand? - if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && - Y.getOpcode() != X86ISD::SETCC) - std::swap(X, Y); - - if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) - return SDValue(); - - SDLoc DL(N); - EVT VT = N->getValueType(0); - X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); - - // If X is -1 or 0, then we have an opportunity to avoid constants required in - // the general case below. - auto *ConstantX = dyn_cast(X); - if (ConstantX) { - if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) || - (IsSub && CC == X86::COND_B && ConstantX->isZero())) { - // This is a complicated way to get -1 or 0 from the carry flag: - // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax - // 0 - SETB --> 0 - (CF) --> CF ? 
-1 : 0 --> SBB %eax, %eax - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - Y.getOperand(1)); - } - - if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) || - (IsSub && CC == X86::COND_A && ConstantX->isZero())) { - SDValue EFLAGS = Y->getOperand(1); - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa(EFLAGS.getOperand(1))) { - // Swap the operands of a SUB, and we have the same pattern as above. - // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB - // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB - SDValue NewSub = DAG.getNode( - X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - NewEFLAGS); - } - } - } - - if (CC == X86::COND_B) { - // X + SETB Z --> adc X, 0 - // X - SETB Z --> sbb X, 0 - return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(0, DL, VT), Y.getOperand(1)); - } - - if (CC == X86::COND_A) { - SDValue EFLAGS = Y.getOperand(1); - // Try to convert COND_A into COND_B in an attempt to facilitate - // materializing "setb reg". - // - // Do not flip "e > c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. - // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), - EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); - return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(0, DL, VT), NewEFLAGS); - } - } - - if (CC == X86::COND_AE) { - // X + SETAE --> sbb X, -1 - // X - SETAE --> adc X, -1 - return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(-1, DL, VT), Y.getOperand(1)); - } - - if (CC == X86::COND_BE) { - // X + SETBE --> sbb X, -1 - // X - SETBE --> adc X, -1 - SDValue EFLAGS = Y.getOperand(1); - // Try to convert COND_BE into COND_AE in an attempt to facilitate - // materializing "setae reg". - // - // Do not flip "e <= c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. - // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode( - X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); - return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, - DAG.getVTList(VT, MVT::i32), X, - DAG.getConstant(-1, DL, VT), NewEFLAGS); - } + // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry) + // iff the flag result is dead. + // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow. 
+ if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) { + SDLoc DL(N); + APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue(); + return DAG.getNode(X86ISD::ADC, DL, N->getVTList(), + DAG.getConstant(0, DL, LHS.getValueType()), + DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn); } - if (CC != X86::COND_E && CC != X86::COND_NE) - return SDValue(); - - SDValue Cmp = Y.getOperand(1); - if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || - !X86::isZeroNode(Cmp.getOperand(1)) || - !Cmp.getOperand(0).getValueType().isInteger()) - return SDValue(); - - SDValue Z = Cmp.getOperand(0); - EVT ZVT = Z.getValueType(); - - // If X is -1 or 0, then we have an opportunity to avoid constants required in - // the general case below. - if (ConstantX) { - // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with - // fake operands: - // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) - // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) - if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) || - (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) { - SDValue Zero = DAG.getConstant(0, DL, ZVT); - SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - } - - // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' - // with fake operands: - // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) - // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) - if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) || - (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) { - SDValue One = DAG.getConstant(1, DL, ZVT); - SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); - SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); - return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - Cmp1.getValue(1)); - } + if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) { + MVT VT = N->getSimpleValueType(0); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags); } - // (cmp Z, 1) sets the carry flag if Z is 0. - SDValue One = DAG.getConstant(1, DL, ZVT); - SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); - SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); - - // Add the flags type for ADC/SBB nodes. - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) - // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) - if (CC == X86::COND_NE) - return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, - DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); + // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry) + // iff the flag result is dead. + if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() && + !N->hasAnyUseOfValue(1)) + return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0), + LHS.getOperand(1), CarryIn); - // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) - // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) - return DAG.getNode(IsSub ? 
X86ISD::SBB : X86ISD::ADC, DL, VTs, X, - DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); + return SDValue(); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -52432,7 +53258,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, /// Try to fold those constants into an 'add' instruction to reduce instruction /// count. We do this with CMOV rather the generic 'select' because there are /// earlier folds that may be used to turn select-of-constants into logic hacks. -static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) { +static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // If an operand is zero, add-of-0 gets simplified away, so that's clearly // better because we eliminate 1-2 instructions. This transform is still // an improvement without zero operands because we trade 2 move constants and @@ -52457,6 +53284,11 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) { if (!isSuitableCmov(Cmov)) return SDValue(); + // Don't remove a load folding opportunity for the add. That would neutralize + // any improvements from removing constant materializations. + if (X86::mayFoldLoad(OtherOp, Subtarget)) + return SDValue(); + EVT VT = N->getValueType(0); SDLoc DL(N); SDValue FalseOp = Cmov.getOperand(0); @@ -52499,7 +53331,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); - if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG)) + if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget)) return Select; if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) @@ -52535,6 +53367,14 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, } } + // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W) + if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() && + X86::isZeroNode(Op0.getOperand(1))) { + assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use"); + return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1, + Op0.getOperand(0), Op0.getOperand(2)); + } + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -52617,6 +53457,25 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) return V; + // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W) + if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() && + X86::isZeroNode(Op1.getOperand(1))) { + assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); + return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0, + Op1.getOperand(0), Op1.getOperand(2)); + } + + // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y) + // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds. 
+ if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() && + !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) { + assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); + SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0, + Op1.getOperand(1), Op1.getOperand(2)); + return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0), + Op1.getOperand(0)); + } + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -52745,6 +53604,17 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Subs.push_back(SubOp.getOperand(I)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); }; + auto IsConcatFree = [](MVT VT, ArrayRef SubOps, unsigned Op) { + for (unsigned I = 0, E = SubOps.size(); I != E; ++I) { + SDValue Sub = SubOps[I].getOperand(Op); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); + if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Sub.getOperand(0).getValueType() != VT || + Sub.getConstantOperandAPInt(1) != (I * NumSubElts)) + return false; + } + return true; + }; unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { @@ -52802,6 +53672,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getTargetConstant(Idx, DL, MVT::i8)); } break; + case X86ISD::PSHUFB: + if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useBWIRegs()))) { + return DAG.getNode(Op0.getOpcode(), DL, VT, + ConcatSubOperand(VT, Ops, 0), + ConcatSubOperand(VT, Ops, 1)); + } + break; case X86ISD::VPERMV3: if (!IsSplat && NumOps == 2 && VT.is512BitVector()) { MVT OpVT = Op0.getSimpleValueType(); @@ -52920,6 +53798,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); } break; + case ISD::VSELECT: + case X86ISD::BLENDV: + if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasInt256()) && + IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) { + EVT SelVT = Ops[0].getOperand(0).getValueType(); + SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + return DAG.getNode(Op0.getOpcode(), DL, VT, + ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0), + ConcatSubOperand(VT, Ops, 1), + ConcatSubOperand(VT, Ops, 2)); + } + break; } } @@ -52937,12 +53828,29 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } + // Attempt to fold target constant loads. 
+ if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) { + SmallVector EltBits; + APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements()); + for (unsigned I = 0, E = Ops.size(); I != E; ++I) { + APInt OpUndefElts; + SmallVector OpEltBits; + if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts, + OpEltBits, true, false)) + break; + EltBits.append(OpEltBits); + UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth()); + } + if (EltBits.size() == VT.getVectorNumElements()) + return getConstVector(EltBits, UndefElts, VT, DAG, DL); + } + return SDValue(); } -static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -52961,9 +53869,9 @@ static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -53044,7 +53952,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // Match concat_vector style patterns. SmallVector SubVectorOps; - if (collectConcatOps(N, SubVectorOps)) { + if (collectConcatOps(N, SubVectorOps, DAG)) { if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; @@ -53103,10 +54011,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, /// This function should only be called with legal types (otherwise, the calls /// to get simple value types will assert). static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { - SDValue Sel = peekThroughBitcasts(Ext->getOperand(0)); + SDValue Sel = Ext->getOperand(0); SmallVector CatOps; if (Sel.getOpcode() != ISD::VSELECT || - !collectConcatOps(Sel.getOperand(0).getNode(), CatOps)) + !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG)) return SDValue(); // Note: We assume simple value types because this should only be called with @@ -53154,9 +54062,9 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { return DAG.getBitcast(VT, NarrowSel); } -static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { // For AVX1 only, if we are extracting from a 256-bit and+not (which will // eventually get combined/lowered into ANDNP) with a concatenated operand, // split the 'and' into 128-bit ops to avoid the concatenate and extract. 
@@ -53177,6 +54085,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, EVT InVecVT = InVec.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned InSizeInBits = InVecVT.getSizeInBits(); + unsigned NumSubElts = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && @@ -53214,22 +54123,24 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, } if (InVec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getBuildVector( - VT, SDLoc(N), - InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); + return DAG.getBuildVector(VT, SDLoc(N), + InVec->ops().slice(IdxVal, NumSubElts)); - // If we are extracting from an insert into a zero vector, replace with a - // smaller insert into zero if we don't access less than the original - // subvector. Don't do this for i1 vectors. + // If we are extracting from an insert into a larger vector, replace with a + // smaller insert if we don't access less than the original subvector. Don't + // do this for i1 vectors. + // TODO: Relax the matching indices requirement? if (VT.getVectorElementType() != MVT::i1 && - InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && - InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && - ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && + InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() && + IdxVal == InVec.getConstantOperandVal(2) && InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { SDLoc DL(N); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), - InVec.getOperand(1), InVec.getOperand(2)); + SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, + InVec.getOperand(0), N->getOperand(1)); + unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal; + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt, + InVec.getOperand(1), + DAG.getVectorIdxConstant(NewIdxVal, DL)); } // If we're extracting an upper subvector from a broadcast we should just @@ -53246,8 +54157,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); // Attempt to extract from the source of a shuffle vector. - if ((InSizeInBits % SizeInBits) == 0 && - (IdxVal % VT.getVectorNumElements()) == 0) { + if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) { SmallVector ShuffleMask; SmallVector ScaledMask; SmallVector ShuffleInputs; @@ -53255,7 +54165,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Decode the shuffle mask and scale it so its shuffling subvectors. 
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { - unsigned SubVecIdx = IdxVal / VT.getVectorNumElements(); + unsigned SubVecIdx = IdxVal / NumSubElts; if (ScaledMask[SubVecIdx] == SM_SentinelUndef) return DAG.getUNDEF(VT); if (ScaledMask[SubVecIdx] == SM_SentinelZero) @@ -53263,7 +54173,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; if (Src.getValueSizeInBits() == InSizeInBits) { unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; - unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); + unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts; return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, SDLoc(N), SizeInBits); } @@ -53273,8 +54183,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. unsigned InOpcode = InVec.getOpcode(); - if (IdxVal == 0 && InVec.hasOneUse()) { - if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { + if (InVec.hasOneUse()) { + if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -53291,7 +54201,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } - if ((InOpcode == ISD::ANY_EXTEND || + if (IdxVal == 0 && + (InOpcode == ISD::ANY_EXTEND || InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || @@ -53306,7 +54217,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); return DAG.getNode(ExtOp, DL, VT, Ext); } - if (InOpcode == ISD::VSELECT && + if (IdxVal == 0 && InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && InVec.getOperand(1).getValueType().is256BitVector() && InVec.getOperand(2).getValueType().is256BitVector()) { @@ -53316,7 +54227,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } - if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && + if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && (VT.is128BitVector() || VT.is256BitVector())) { SDLoc DL(N); SDValue InVecSrc = InVec.getOperand(0); @@ -53324,6 +54235,13 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext); } + if (InOpcode == X86ISD::MOVDDUP && + (VT.is128BitVector() || VT.is256BitVector())) { + SDLoc DL(N); + SDValue Ext0 = + extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext0); + } } // Always split vXi64 logical shifts where we're extracting the upper 32-bits @@ -53476,11 +54394,9 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, ISD::isBuildVectorAllZeros(RHS.getNode())) return DAG.getConstant(0, SDLoc(N), VT); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if 
(TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -53494,6 +54410,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, unsigned Opcode = N->getOpcode(); unsigned InOpcode = In.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); // Try to merge vector loads and extend_inreg to an extload. if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && @@ -53506,10 +54423,9 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, : ISD::ZEXTLOAD; EVT MemVT = VT.changeVectorElementType(SVT); if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { - SDValue Load = - DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), - Ld->getMemOperand()->getFlags()); + SDValue Load = DAG.getExtLoad( + Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; } @@ -53518,7 +54434,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X). if (Opcode == InOpcode) - return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0)); + return DAG.getNode(Opcode, DL, VT, In.getOperand(0)); // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0)) // -> EXTEND_VECTOR_INREG(X). @@ -53527,12 +54443,26 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) && In.getOperand(0).getOperand(0).getValueSizeInBits() == In.getValueSizeInBits()) - return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0)); + return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0)); - // Attempt to combine as a shuffle. - // TODO: General ZERO_EXTEND_VECTOR_INREG support. - if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG || - (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) { + // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0). + // TODO: Move to DAGCombine? + if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && + In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() && + In.getValueSizeInBits() == VT.getSizeInBits()) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits(); + EVT EltVT = In.getOperand(0).getValueType(); + SmallVector Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT)); + for (unsigned I = 0; I != NumElts; ++I) + Elts[I * Scale] = In.getOperand(I); + return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts)); + } + + // Attempt to combine as a shuffle on SSE41+ targets. 
+ if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG || + Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) && + Subtarget.hasSSE41()) { SDValue Op(N, 0); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) @@ -53549,11 +54479,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return DAG.getConstant(0, SDLoc(N), VT); - APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, - KnownZero, DCI)) + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); @@ -53781,11 +54709,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::CONCAT_VECTORS: - return combineConcatVectors(N, DAG, DCI, Subtarget); + return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: - return combineInsertSubvector(N, DAG, DCI, Subtarget); + return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: - return combineExtractSubvector(N, DAG, DCI, Subtarget); + return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget); @@ -54397,37 +55325,37 @@ TargetLowering::ConstraintWeight weight = CW_Register; break; case 'I': - if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { + if (auto *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } @@ -54439,14 +55367,14 @@ TargetLowering::ConstraintWeight } break; case 'e': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': - if (ConstantInt *C = dyn_cast(CallOperandVal)) { + if (auto *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } @@ -54511,7 +55439,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, switch (ConstraintLetter) { default: break; case 'I': - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54520,7 +55448,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'J': - 
if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54529,7 +55457,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'K': - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54538,7 +55466,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'L': - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), @@ -54548,7 +55476,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'M': - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54557,7 +55485,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'N': - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54566,7 +55494,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; case 'O': - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); @@ -54576,7 +55504,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'e': { // 32-bit signed value - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. @@ -54590,7 +55518,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } case 'Z': { // 32-bit unsigned value - if (ConstantSDNode *C = dyn_cast(Op)) { + if (auto *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), @@ -54604,7 +55532,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } case 'i': { // Literal immediates are always ok. - if (ConstantSDNode *CST = dyn_cast(Op)) { + if (auto *CST = dyn_cast(Op)) { bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; BooleanContent BCont = getBooleanContents(MVT::i64); ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) @@ -54617,8 +55545,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't - // be used as immediates. - if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) + // be used as immediates. BlockAddresses are fine though. 
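
Note on the constraint handling above: both the weight computation and LowerAsmOperandForConstraint repeatedly test whether a constant fits the range implied by an inline-asm constraint letter. The following is a minimal standalone C++ sketch of those ranges, mirroring the weight-computation switch above (it is not the LLVM implementation; the lowering path additionally accepts 0xffffffff for 'L' in 64-bit mode and handles 'O' as 0..127).

  #include <cstdint>

  // Standalone model of the x86 inline-asm immediate constraint ranges
  // checked above; illustration only.
  static bool fitsX86ImmConstraint(char Letter, int64_t V) {
    switch (Letter) {
    case 'I': return V >= 0 && V <= 31;
    case 'J': return V >= 0 && V <= 63;
    case 'K': return V >= -0x80 && V <= 0x7f;          // signed 8-bit
    case 'L': return V == 0xff || V == 0xffff;         // zero-extending masks
    case 'M': return V >= 0 && V <= 3;
    case 'N': return V >= 0 && V <= 0xff;              // unsigned 8-bit
    case 'e': return V >= INT32_MIN && V <= INT32_MAX; // signed 32-bit
    case 'Z': return V >= 0 && V <= 0xffffffff;        // unsigned 32-bit
    default:  return false;
    }
  }
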
+ if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) && + !isa(Op)) return; // If we are in non-pic codegen mode, we allow the address of a global (with diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3f6d567d3f4d..af110884049b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -249,9 +249,6 @@ namespace llvm { SCALEFS, SCALEFS_RND, - // Unsigned Integer average. - AVG, - /// Integer horizontal add/sub. HADD, HSUB, @@ -790,6 +787,9 @@ namespace llvm { LOR, LXOR, LAND, + LBTS, + LBTC, + LBTR, // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, @@ -1039,10 +1039,7 @@ namespace llvm { bool isCtlzFast() const override; - bool hasBitPreservingFPLogic(EVT VT) const override { - return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() || - (VT == MVT::f16 && X86ScalarSSEf16); - } + bool hasBitPreservingFPLogic(EVT VT) const override; bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { // If the pair to store is a mixture of float and int values, we will @@ -1163,6 +1160,19 @@ namespace llvm { APInt &UndefElts, unsigned Depth) const override; + bool isTargetCanonicalConstantNode(SDValue Op) const override { + // Peek through bitcasts/extracts/inserts to see if we have a broadcast + // vector from memory. + while (Op.getOpcode() == ISD::BITCAST || + Op.getOpcode() == ISD::EXTRACT_SUBVECTOR || + (Op.getOpcode() == ISD::INSERT_SUBVECTOR && + Op.getOperand(0).isUndef())) + Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0); + + return Op.getOpcode() == X86ISD::VBROADCAST_LOAD || + TargetLowering::isTargetCanonicalConstantNode(Op); + } + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; @@ -1288,6 +1298,9 @@ namespace llvm { /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, + EVT VT) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and stores the intrinsic information into the IntrinsicInfo that was @@ -1316,15 +1329,13 @@ namespace llvm { /// Returns true if lowering to a jump table is allowed. bool areJTsAllowed(const Function *Fn) const override; + MVT getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const override; + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. - bool ShouldShrinkFPConstant(EVT VT) const override { - // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more - // expensive than a straight movsd. On the other hand, it's important to - // shrink long double fp constant since fldt is very slow. - return !X86ScalarSSEf64 || VT == MVT::f80; - } + bool ShouldShrinkFPConstant(EVT VT) const override; /// Return true if we believe it is correct and profitable to reduce the /// load node to a smaller type. @@ -1333,11 +1344,7 @@ namespace llvm { /// Return true if the specified scalar FP type is computed in an SSE /// register, not on the X87 floating point stack. 
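
The new isTargetCanonicalConstantNode hook above strips value-preserving wrappers (bitcasts, subvector extracts, inserts into an undef vector) before testing for a broadcast load. A tiny self-contained model of that peeling loop, with node kinds invented purely for illustration:

  // Self-contained model of the peek-through loop above; the node kinds and
  // fields here are made up and do not correspond to real SelectionDAG types.
  enum class NodeKind { Bitcast, ExtractSubvector, InsertIntoUndef, BroadcastLoad, Other };
  struct ModelNode { NodeKind Kind; const ModelNode *Wrapped = nullptr; };

  static bool looksLikeBroadcastConstant(const ModelNode *N) {
    // Look through the wrappers to the underlying node.
    while (N->Kind == NodeKind::Bitcast ||
           N->Kind == NodeKind::ExtractSubvector ||
           N->Kind == NodeKind::InsertIntoUndef)
      N = N->Wrapped;
    return N->Kind == NodeKind::BroadcastLoad;
  }
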
- bool isScalarFPTypeInSSEReg(EVT VT) const { - return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 - (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 - } + bool isScalarFPTypeInSSEReg(EVT VT) const; /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. @@ -1491,13 +1498,6 @@ namespace llvm { /// make the right decision when generating code for different targets. const X86Subtarget &Subtarget; - /// Select between SSE or x87 floating point ops. - /// When SSE is available, use it for f32 operations. - /// When SSE2 is available, use it for f64 operations. - bool X86ScalarSSEf32; - bool X86ScalarSSEf64; - bool X86ScalarSSEf16; - /// A list of legal FP immediates. std::vector LegalFPImmediates; @@ -1637,9 +1637,13 @@ namespace llvm { TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; + void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; @@ -1649,6 +1653,8 @@ namespace llvm { bool needsCmpXchgNb(Type *MemType) const; + template bool isSoftFP16(T VT) const; + void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp index e08b4b7c03c6..001aa2dcb879 100644 --- a/llvm/lib/Target/X86/X86IndirectThunks.cpp +++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -31,6 +31,7 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 004e6fa5ebf4..08dc514a6476 100644 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -23,6 +23,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/ProfileData/SampleProf.h" diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index ff8710634e89..c098122685be 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -354,10 +354,9 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, // If the shift amount is guaranteed to be in-range we can replace it with a // generic shift. - APInt UpperBits = - APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); - if (llvm::MaskedValueIsZero(Amt, UpperBits, - II.getModule()->getDataLayout())) { + KnownBits KnownAmt = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmt.getMaxValue().ult(BitWidth)) { return (LogicalShift ? (ShiftLeft ? 
Builder.CreateShl(Vec, Amt) : Builder.CreateLShr(Vec, Amt)) : Builder.CreateAShr(Vec, Amt)); @@ -521,11 +520,10 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II, // %int = bitcast <16 x i1> %cmp to i16 // %res = zext i16 %int to i32 unsigned NumElts = ArgTy->getNumElements(); - Type *IntegerVecTy = VectorType::getInteger(ArgTy); Type *IntegerTy = Builder.getIntNTy(NumElts); - Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); - Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy)); + Res = Builder.CreateIsNeg(Res); Res = Builder.CreateBitCast(Res, IntegerTy); Res = Builder.CreateZExtOrTrunc(Res, ResTy); return Res; @@ -997,20 +995,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, II.getArgOperand(0)); } - if (MaskC->getValue().isShiftedMask()) { + unsigned MaskIdx, MaskLen; + if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { // any single contingous sequence of 1s anywhere in the mask simply // describes a subset of the input bits shifted to the appropriate // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); Value *Input = II.getArgOperand(0); Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); - Value *Shifted = IC.Builder.CreateLShr(Masked, - ConstantInt::get(II.getType(), - ShiftAmount)); + Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); + Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt); return IC.replaceInstUsesWith(II, Shifted); } - if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { uint64_t Src = SrcC->getZExtValue(); uint64_t Mask = MaskC->getZExtValue(); @@ -1042,15 +1038,15 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (MaskC->isAllOnesValue()) { return IC.replaceInstUsesWith(II, II.getArgOperand(0)); } - if (MaskC->getValue().isShiftedMask()) { + + unsigned MaskIdx, MaskLen; + if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { // any single contingous sequence of 1s anywhere in the mask simply // describes a subset of the input bits shifted to the appropriate // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); Value *Input = II.getArgOperand(0); - Value *Shifted = IC.Builder.CreateShl(Input, - ConstantInt::get(II.getType(), - ShiftAmount)); + Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); + Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); return IC.replaceInstUsesWith(II, Masked); } @@ -1934,6 +1930,23 @@ Optional X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( break; } + // General per-element vector operations. 
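
The two isShiftedMask blocks above rely on the identity that, for a mask made of one contiguous run of ones starting at bit MaskIdx, PEXT collapses to mask-then-shift (and PDEP to shift-then-mask). A self-contained check of the PEXT side, using a small software model of pext so it runs without BMI2:

  #include <cassert>
  #include <cstdint>

  // Software model of PEXT: pack the source bits selected by Mask into the
  // low bits of the result.
  static uint64_t pextModel(uint64_t Src, uint64_t Mask) {
    uint64_t Res = 0;
    for (uint64_t Bit = 1; Mask; Mask >>= 1, Src >>= 1)
      if (Mask & 1) {
        if (Src & 1)
          Res |= Bit;
        Bit <<= 1;
      }
    return Res;
  }

  int main() {
    const unsigned Idx = 4, Len = 8;
    const uint64_t Mask = ((uint64_t{1} << Len) - 1) << Idx; // contiguous run
    const uint64_t Srcs[] = {0, 0x1234, 0xdeadbeefcafef00dULL};
    for (uint64_t Src : Srcs)
      assert(pextModel(Src, Mask) == ((Src & Mask) >> Idx)); // and-then-lshr
    return 0;
  }
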
+ case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: { + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + UndefElts &= UndefElts2; + break; + } + case Intrinsic::x86_sse2_packssdw_128: case Intrinsic::x86_sse2_packsswb_128: case Intrinsic::x86_sse2_packuswb_128: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index d825981a6b36..5da06bc87b06 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,18 +48,23 @@ let Predicates = [HasAMXTILE, In64BitMode] in { VEX, T8XD; // Pseduo instruction for RA. - def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), - [(int_x86_ldtilecfg_internal addr:$src)]>; + let isPseudo = true, mayLoad = 1, hasSideEffects = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), []>; + let isPseudo = true, mayLoad = 1 in def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let isPseudo = true, mayLoad = 1 in def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let isPseudo = true, mayStore = 1 in def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; - let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in + let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1, + canFoldAsLoad = 1 in def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2), [(set TILE:$dst, (int_x86_tilezero_internal GR16:$src1, GR16:$src2))]>; @@ -67,9 +72,12 @@ let Predicates = [HasAMXTILE, In64BitMode] in { let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp + let mayLoad = 1 in def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayLoad = 1 in def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayStore = 1 in def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; @@ -99,7 +107,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in { } // Pseduo instruction for RA. - let Constraints = "$src4 = $dst" in { + let isPseudo = true, Constraints = "$src4 = $dst" in { def PTDPBSSDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), @@ -158,7 +166,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in { []>, VEX_4V, T8XS; // Pseduo instruction for RA. 
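
The intrinsics added to the demanded-elements handling above are strictly per-lane: result lane i reads only lane i of the value vector and of the shift-amount vector, which is why both operands can be simplified with the same DemandedElts and the undef masks intersected. A standalone scalar model of the vpsllvd flavor (shift amounts of 32 or more produce zero), given as an illustration rather than a definition of the intrinsic:

  #include <array>
  #include <cstdint>

  // Scalar model of a VPSLLVD-style per-lane variable shift.
  static std::array<uint32_t, 4> psllvdModel(const std::array<uint32_t, 4> &Val,
                                             const std::array<uint32_t, 4> &Amt) {
    std::array<uint32_t, 4> Res{};
    for (unsigned I = 0; I < 4; ++I)
      Res[I] = Amt[I] < 32 ? Val[I] << Amt[I] : 0; // out-of-range shifts give 0
    return Res;
  }
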
- let Constraints = "$src4 = $dst" in + let isPseudo = true, Constraints = "$src4 = $dst" in def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index bc67d1f89d7f..48da7b3ac882 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -476,6 +476,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; } @@ -508,25 +509,23 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>; } -let Predicates = [HasFP16] in { -def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; -def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; -def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; -} - // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in { + def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", + [(set FR16X:$dst, fp16imm0)]>; def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", @@ -535,12 +534,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, [(set VR128X:$dst, fp128imm0)]>; } -let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in { - def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", - [(set FR16X:$dst, fp16imm0)]>; -} - //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // @@ -678,21 +671,21 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; // Codegen pattern with the alternative types insert VEC128 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; // Codegen pattern with the alternative types insert VEC256 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; multiclass vinsert_for_mask_cast; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, @@ -987,14 +980,14 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; // Codegen pattern with the alternative types extract VEC256 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, 
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; // A 128-bit extract from bits [255:128] of a 512-bit vector should use a @@ -1020,6 +1013,10 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), (v8i16 (VEXTRACTI128rr (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), (iPTR 1)))>; +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF128rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), (v16i8 (VEXTRACTI128rr (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), @@ -1049,18 +1046,16 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), (v8i16 (VEXTRACTI32x4Z256rr (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), (iPTR 1)))>; +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF32x4Z256rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), (v16i8 (VEXTRACTI32x4Z256rr (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), (iPTR 1)))>; } -let Predicates = [HasFP16, HasVLX] in -def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), - (v8f16 (VEXTRACTF32x4Z256rr - (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), - (iPTR 1)))>; - // Additional patterns for handling a bitcast between the vselect and the // extract_subvector. @@ -1478,7 +1473,7 @@ multiclass avx512_subvec_broadcast_rm_dq opc, string OpcodeStr, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } -let Predicates = [HasFP16] in { +let Predicates = [HasBWI] in { def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)), (VPBROADCASTWZrm addr:$src)>; @@ -1487,7 +1482,7 @@ let Predicates = [HasFP16] in { def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))), (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; } -let Predicates = [HasVLX, HasFP16] in { +let Predicates = [HasVLX, HasBWI] in { def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), (VPBROADCASTWZ128rm addr:$src)>; def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), @@ -3763,6 +3758,9 @@ let Predicates = [HasBWI, NoVLX] in { defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>; defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>; + + defm : mask_move_lowering<"VMOVDQU16Z", v8f16x_info, v32f16_info>; + defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>; } let Predicates = [HasAVX512] in { @@ -3852,7 +3850,7 @@ let Predicates = [HasVLX] in { def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } -let Predicates = [HasFP16] in { +let Predicates = [HasBWI] in { def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))), (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), @@ -3887,7 +3885,7 @@ let Predicates = [HasFP16] in { def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } -let Predicates = [HasFP16, HasVLX] in { +let Predicates = [HasBWI, HasVLX] in { def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rrk 
VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), @@ -4099,14 +4097,14 @@ def : Pat<(f64 (bitconvert VK64:$src)), //===----------------------------------------------------------------------===// multiclass avx512_move_scalar prd = [HasAVX512, OptForSize]> { - let Predicates = prd in + X86VectorVTInfo _, Predicate prd = HasAVX512> { + let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; + let Predicates = [prd] in { def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", @@ -4159,6 +4157,7 @@ multiclass avx512_move_scalar, EVEX, EVEX_K, Sched<[WriteFStore]>, NotMemoryFoldable; + } } defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>, @@ -4168,7 +4167,7 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, - [HasFP16]>, + HasFP16>, VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering; } -defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; -defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; -defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4353,6 +4347,12 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +let Predicates = [HasFP16] in { +defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; +defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, (v32i1 (insert_subvector (v32i1 immAllZerosV), @@ -4360,6 +4360,30 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, (iPTR 0))), (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), GR8, sub_8bit>; + +defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; +defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 
(bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk + (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), + VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; +} + defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4385,10 +4409,6 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; -defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; -defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4396,13 +4416,6 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; -defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (insert_subvector - (v32i1 immAllZerosV), - (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), - (iPTR 0))), - (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), - GR8, sub_8bit>; defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4428,16 +4441,6 @@ defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; -def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), - (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk - (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), - VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), - (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; - -def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), - (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), - (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; - def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), @@ -5039,7 +5042,7 @@ defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, HasBWI, 1>; defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SchedWriteVecIMul, HasBWI, 1>, T8PD; -defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu, SchedWriteVecALU, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, SchedWriteVecIMul, HasAVX512, 1>, T8PD; @@ -11651,6 +11654,14 @@ defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +// Always select FP16 instructions if available. 
+let Predicates = [HasBWI], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>; + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>; +} + //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations //===----------------------------------------------------------------------===// @@ -12988,7 +12999,6 @@ def : Pat<(i16 (bitconvert FR16X:$src)), sub_16bit))>; def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>; -} // Allow "vmovw" to use GR64 let hasSideEffects = 0 in { @@ -12997,6 +13007,7 @@ let hasSideEffects = 0 in { def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>; } +} // Convert 16-bit float to i16/u16 multiclass avx512_cvtph2w opc, string OpcodeStr, SDPatternOperator OpNode, diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 8337d2b37383..f08ecdf6afc9 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -541,7 +541,7 @@ class X86TypeInfo { + bit hasREX_W> { /// VT - This is the value type itself. ValueType VT = vt; @@ -596,9 +596,9 @@ class X86TypeInfo>", SDTIntLeaf,[],"<>">; @@ -634,7 +634,7 @@ class ITy opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, // Infer instruction prefixes from type info. let OpSize = typeinfo.OpSize; - let hasREX_WPrefix = typeinfo.HasREX_WPrefix; + let hasREX_W = typeinfo.HasREX_W; } // BinOpRR - Instructions like "add reg, reg, reg". diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 330b8c7a8a43..79ac2a2d8019 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -14,7 +14,7 @@ // CMOV instructions. 
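
The negative-AddedComplexity fallback patterns above route f16 loads, stores, and bitcasts through the integer pinsrw/pextrw path when no dedicated FP16 instruction matches. A conceptual standalone sketch of the reinterpretation they implement, written with the Clang/GCC x86 extension type _Float16 purely for illustration:

  #include <cstdint>
  #include <cstring>

  // f16 <-> i16 bitconvert modeled with memcpy; assumes a compiler that
  // provides _Float16 on x86.
  static uint16_t f16ToBits(_Float16 H) {
    uint16_t Bits;
    std::memcpy(&Bits, &H, sizeof(Bits)); // f16 -> i16
    return Bits;
  }
  static _Float16 bitsToF16(uint16_t Bits) {
    _Float16 H;
    std::memcpy(&H, &Bits, sizeof(H));    // i16 -> f16
    return H;
  }
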
let isCodeGenOnly = 1, ForceDisassemble = 1 in { -let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", +let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst", isCommutable = 1, SchedRW = [WriteCMOV] in { def CMOV16rr : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), @@ -35,7 +35,7 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB; } -let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", +let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst", SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in { def CMOV16rm : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), @@ -52,7 +52,7 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), timm:$cond, EFLAGS))]>, TB; -} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst" } // isCodeGenOnly = 1, ForceDisassemble = 1 def inv_cond_XFORM : SDNodeXForm; def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS), diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 7288ce812138..a55b95960aa6 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -544,10 +544,10 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { // i8 register pressure. defm _GR8 : CMOVrr_PSEUDO; - let Predicates = [NoCMov] in { + let Predicates = [NoCMOV] in { defm _GR32 : CMOVrr_PSEUDO; defm _GR16 : CMOVrr_PSEUDO; - } // Predicates = [NoCMov] + } // Predicates = [NoCMOV] // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no // SSE1/SSE2. @@ -562,12 +562,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { let Predicates = [HasMMX] in defm _VR64 : CMOVrr_PSEUDO; - defm _FR16X : CMOVrr_PSEUDO; let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO; - let Predicates = [HasSSE2,NoAVX512] in + let Predicates = [HasSSE2,NoAVX512] in { + defm _FR16 : CMOVrr_PSEUDO; defm _FR64 : CMOVrr_PSEUDO; + } let Predicates = [HasAVX512] in { + defm _FR16X : CMOVrr_PSEUDO; defm _FR32X : CMOVrr_PSEUDO; defm _FR64X : CMOVrr_PSEUDO; } @@ -670,7 +672,7 @@ def OR32mi8Locked : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$zero), Requires<[Not64BitMode]>, OpSize32, LOCK, Sched<[WriteALURMW]>; -let hasSideEffects = 1 in +let hasSideEffects = 1, isMeta = 1 in def Int_MemBarrier : I<0, Pseudo, (outs), (ins), "#MEMBARRIER", [(X86MemBarrier)]>, Sched<[WriteLoad]>; @@ -839,6 +841,38 @@ let Predicates = [UseIncDec] in { def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>; } +// Atomic bit test. 
+def X86LBTest : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, + SDTCisVT<2, i8>, SDTCisVT<3, i32>]>; +def x86bts : SDNode<"X86ISD::LBTS", X86LBTest, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def x86btc : SDNode<"X86ISD::LBTC", X86LBTest, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def x86btr : SDNode<"X86ISD::LBTR", X86LBTest, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; + +multiclass ATOMIC_LOGIC_OP { + let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteBitTestSetRegRMW] in { + def 16m : Ii8<0xBA, Form, (outs), (ins i16mem:$src1, i8imm:$src2), + !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (!cast("x86" # s) addr:$src1, timm:$src2, (i32 16)))]>, + OpSize16, TB, LOCK; + def 32m : Ii8<0xBA, Form, (outs), (ins i32mem:$src1, i8imm:$src2), + !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (!cast("x86" # s) addr:$src1, timm:$src2, (i32 32)))]>, + OpSize32, TB, LOCK; + def 64m : RIi8<0xBA, Form, (outs), (ins i64mem:$src1, i8imm:$src2), + !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (!cast("x86" # s) addr:$src1, timm:$src2, (i32 64)))]>, + TB, LOCK; + } +} + +defm LOCK_BTS : ATOMIC_LOGIC_OP; +defm LOCK_BTC : ATOMIC_LOGIC_OP; +defm LOCK_BTR : ATOMIC_LOGIC_OP; + // Atomic compare and swap. multiclass LCMPXCHG_BinOp Opc8, bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag> { @@ -863,7 +897,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX8], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, usesCustomInserter = 1 in { def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), "cmpxchg8b\t$ptr", @@ -871,7 +905,7 @@ def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), } let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), "cmpxchg16b\t$ptr", @@ -898,7 +932,7 @@ def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), // the instruction and we are sure we will have a valid register to restore // the value of RBX. let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$rbx_save = $dst" in { @@ -910,7 +944,7 @@ def LCMPXCHG16B_SAVE_RBX : // Pseudo instruction that doesn't read/write RBX. Will be turned into either // LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter. 
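
The new LBTS/LBTC/LBTR nodes and the ATOMIC_LOGIC_OP multiclass above allow single-bit atomic RMW operations to lower to lock bts/btc/btr, whose result is the previous value of the tested bit (returned in CF). A minimal standalone model of what the bit-test-and-set form computes, written with std::atomic rather than the locked instruction itself:

  #include <atomic>
  #include <cstdint>

  // Atomically set bit Idx of Word and return the bit's previous value,
  // i.e. what `lock bts` leaves in CF.
  static bool atomicBitTestAndSet(std::atomic<uint32_t> &Word, unsigned Idx) {
    const uint32_t Mask = uint32_t{1} << (Idx & 31);
    const uint32_t Old = Word.fetch_or(Mask, std::memory_order_seq_cst);
    return (Old & Mask) != 0;
  }
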
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0, usesCustomInserter = 1 in { @@ -1235,6 +1269,21 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), return true; }]>; +def X86tcret_1reg : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 1; + const SDValue& BasePtr = cast(N->getOperand(1))->getBasePtr(); + if (isa(BasePtr)) + NumRegs = 3; + else if (BasePtr->getNumOperands() && isa(BasePtr->getOperand(0))) + NumRegs = 3; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa(N->getOperand(i)) && ( NumRegs-- == 0)) + return false; + return true; +}]>; + def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>, Requires<[Not64BitMode, NotUseIndirectThunkCalls]>; @@ -1242,7 +1291,8 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. -def : Pat<(X86tcret (load addr:$dst), timm:$off), +// Similar to X86tcret_6regs, here we only have 1 register left +def : Pat<(X86tcret_1reg (load addr:$dst), timm:$off), (TCRETURNmi addr:$dst, timm:$off)>, Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>; @@ -1466,6 +1516,21 @@ def ADD64ri32_DB : I<0, Pseudo, } } // AddedComplexity, SchedRW +//===----------------------------------------------------------------------===// +// Pattern match XOR as ADD +//===----------------------------------------------------------------------===// + +// Prefer to pattern match XOR with min_signed_value as ADD at isel time. +// ADD can be 3-addressified into an LEA instruction to avoid copies. +let AddedComplexity = 5 in { +def : Pat<(xor GR8:$src1, -128), + (ADD8ri GR8:$src1, -128)>; +def : Pat<(xor GR16:$src1, -32768), + (ADD16ri GR16:$src1, -32768)>; +def : Pat<(xor GR32:$src1, -2147483648), + (ADD32ri GR32:$src1, -2147483648)>; +} + //===----------------------------------------------------------------------===// // Pattern match SUB as XOR //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 6d969962afff..aa89a6f0ff9d 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -147,7 +147,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // Win64 wants indirect jumps leaving the function to have a REX_W prefix. // These are switched from TAILJMPr/m64_REX in MCInstLower. - let isCodeGenOnly = 1, hasREX_WPrefix = 1 in { + let isCodeGenOnly = 1, hasREX_W = 1 in { def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>; let mayLoad = 1 in @@ -384,7 +384,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, []>, Sched<[WriteJumpLd]>; // Win64 wants indirect jumps leaving the function to have a REX_W prefix. 
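
The new XOR-as-ADD patterns above rest on the identity that XOR with the minimum signed value and ADD of that value agree for every input: adding the sign bit can only flip it, because any carry out of the top bit is discarded, and ADD is preferable since it can be 3-addressified into LEA. A quick standalone check of the 32-bit case:

  #include <cassert>
  #include <cstdint>

  // xor x, 0x80000000 == add x, 0x80000000 for every 32-bit x.
  int main() {
    const uint32_t SignBit = 0x80000000u;
    const uint32_t Vals[] = {0u, 1u, 0x7fffffffu, 0x80000000u, 0xdeadbeefu};
    for (uint32_t X : Vals)
      assert((X ^ SignBit) == uint32_t(X + SignBit));
    return 0;
  }
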
- let hasREX_WPrefix = 1 in { + let hasREX_W = 1 in { def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), []>, Sched<[WriteJump]>; diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td index e310f369be08..a68d61043c5c 100644 --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -423,9 +423,9 @@ def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">; // Floating point cmovs. class FpIf32CMov pattern> : - FpI_, Requires<[FPStackf32, HasCMov]>; + FpI_, Requires<[FPStackf32, HasCMOV]>; class FpIf64CMov pattern> : - FpI_, Requires<[FPStackf64, HasCMov]>; + FpI_, Requires<[FPStackf64, HasCMOV]>; multiclass FPCMov { def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), @@ -440,7 +440,7 @@ multiclass FPCMov { CondMovFP, [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, cc, EFLAGS))]>, - Requires<[HasCMov]>; + Requires<[HasCMOV]>; } let SchedRW = [WriteFCMOV] in { @@ -455,7 +455,7 @@ defm CMOVNE : FPCMov; defm CMOVNP : FPCMov; } // Uses = [EFLAGS], Constraints = "$src1 = $dst" -let Predicates = [HasCMov] in { +let Predicates = [HasCMOV] in { // These are not factored because there's no clean way to pass DA/DB. def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op), "fcmovb\t{$op, %st|st, $op}">; @@ -473,7 +473,7 @@ def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op), "fcmovne\t{$op, %st|st, $op}">; def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op), "fcmovnu\t{$op, %st|st, $op}">; -} // Predicates = [HasCMov] +} // Predicates = [HasCMOV] } // SchedRW let mayRaiseFPException = 1 in { @@ -664,22 +664,22 @@ let SchedRW = [WriteFCom], mayRaiseFPException = 1 in { let Defs = [EFLAGS, FPSW], Uses = [FPCW] in { def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>, - Requires<[FPStackf32, HasCMov]>; + Requires<[FPStackf32, HasCMOV]>; def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP64:$lhs, RFP64:$rhs))]>, - Requires<[FPStackf64, HasCMov]>; + Requires<[FPStackf64, HasCMOV]>; def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP80:$lhs, RFP80:$rhs))]>, - Requires<[HasCMov]>; + Requires<[HasCMOV]>; def COM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86strict_fcmps RFP32:$lhs, RFP32:$rhs))]>, - Requires<[FPStackf32, HasCMov]>; + Requires<[FPStackf32, HasCMOV]>; def COM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set EFLAGS, (X86strict_fcmps RFP64:$lhs, RFP64:$rhs))]>, - Requires<[FPStackf64, HasCMov]>; + Requires<[FPStackf64, HasCMOV]>; def COM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86strict_fcmps RFP80:$lhs, RFP80:$rhs))]>, - Requires<[HasCMov]>; + Requires<[HasCMOV]>; } let Uses = [ST0, FPCW] in { diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 226349485238..27220a8d4d99 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -292,8 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD }, { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD }, - { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE }, - { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | 
TB_NO_REVERSE }, + { X86::MMX_MOVD64from64rr, X86::MMX_MOVQ64mr, TB_FOLDED_STORE }, + { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index 0e7033fc233a..3a44b4570e9b 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -196,7 +196,7 @@ class OpSize32 { OperandSize OpSize = OpSize32; } class AdSize16 { AddressSize AdSize = AdSize16; } class AdSize32 { AddressSize AdSize = AdSize32; } class AdSize64 { AddressSize AdSize = AdSize64; } -class REX_W { bit hasREX_WPrefix = 1; } +class REX_W { bit hasREX_W = 1; } class LOCK { bit hasLockPrefix = 1; } class REP { bit hasREPPrefix = 1; } class TB { Map OpMap = TB; } @@ -316,7 +316,7 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bits<3> OpPrefixBits = OpPrefix.Value; Map OpMap = OB; // Which opcode map does this inst have? bits<4> OpMapBits = OpMap.Value; - bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? + bit hasREX_W = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? Domain ExeDomain = d; @@ -375,7 +375,7 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, // No need for 3rd bit, we don't need to distinguish NoPrfx from PS. let TSFlags{12-11} = OpPrefixBits{1-0}; let TSFlags{16-13} = OpMapBits; - let TSFlags{17} = hasREX_WPrefix; + let TSFlags{17} = hasREX_W; let TSFlags{21-18} = ImmT.Value; let TSFlags{24-22} = FPForm.Value; let TSFlags{25} = hasLockPrefix; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 166f1f8c3251..57ba4683c6a4 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -287,7 +287,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<2, 1>]>; def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; -def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4dcd886fa3b2..ec32ac2acad1 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -25,13 +25,16 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -137,298 +140,70 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - // By default, assume that the instruction is not data invariant. 
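
With the target-specific X86avg node removed above (the VPAVG patterns earlier in this import now use the generic avgceilu node instead), it is worth spelling out what that operation computes: the unsigned average rounded up. A standalone per-lane model and exhaustive check for 8-bit elements, offered as an illustration of the semantics rather than of the lowering:

  #include <cassert>
  #include <cstdint>

  // Unsigned average rounded up, (A + B + 1) / 2, computed without a wider
  // intermediate: A|B = A&B + A^B, so this equals A&B + ceil((A^B)/2).
  static uint8_t avgCeilU8(uint8_t A, uint8_t B) {
    return (A | B) - ((A ^ B) >> 1);
  }

  int main() {
    for (unsigned A = 0; A < 256; ++A)
      for (unsigned B = 0; B < 256; ++B)
        assert(avgCeilU8(A, B) == (A + B + 1) / 2);
    return 0;
  }
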
+ if (MI.mayLoad() || MI.mayStore()) return false; - // Some target-independent operations that trivially lower to data-invariant - // instructions. - case TargetOpcode::COPY: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: + // Some target-independent operations that trivially lower to data-invariant + // instructions. + if (MI.isCopyLike() || MI.isInsertSubreg()) return true; + unsigned Opcode = MI.getOpcode(); + using namespace X86; // On x86 it is believed that imul is constant time w.r.t. the loaded data. // However, they set flags and are perhaps the most surprisingly constant // time operations so we call them out here separately. - case X86::IMUL16rr: - case X86::IMUL16rri8: - case X86::IMUL16rri: - case X86::IMUL32rr: - case X86::IMUL32rri8: - case X86::IMUL32rri: - case X86::IMUL64rr: - case X86::IMUL64rri32: - case X86::IMUL64rri8: - + if (isIMUL(Opcode)) + return true; // Bit scanning and counting instructions that are somewhat surprisingly // constant time as they scan across bits and do other fairly complex // operations like popcnt, but are believed to be constant time on x86. // However, these set flags. - case X86::BSF16rr: - case X86::BSF32rr: - case X86::BSF64rr: - case X86::BSR16rr: - case X86::BSR32rr: - case X86::BSR64rr: - case X86::LZCNT16rr: - case X86::LZCNT32rr: - case X86::LZCNT64rr: - case X86::POPCNT16rr: - case X86::POPCNT32rr: - case X86::POPCNT64rr: - case X86::TZCNT16rr: - case X86::TZCNT32rr: - case X86::TZCNT64rr: - + if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) || + isTZCNT(Opcode)) + return true; // Bit manipulation instructions are effectively combinations of basic // arithmetic ops, and should still execute in constant time. These also // set flags. - case X86::BLCFILL32rr: - case X86::BLCFILL64rr: - case X86::BLCI32rr: - case X86::BLCI64rr: - case X86::BLCIC32rr: - case X86::BLCIC64rr: - case X86::BLCMSK32rr: - case X86::BLCMSK64rr: - case X86::BLCS32rr: - case X86::BLCS64rr: - case X86::BLSFILL32rr: - case X86::BLSFILL64rr: - case X86::BLSI32rr: - case X86::BLSI64rr: - case X86::BLSIC32rr: - case X86::BLSIC64rr: - case X86::BLSMSK32rr: - case X86::BLSMSK64rr: - case X86::BLSR32rr: - case X86::BLSR64rr: - case X86::TZMSK32rr: - case X86::TZMSK64rr: - + if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) || + isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) || + isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) || + isTZMSK(Opcode)) + return true; // Bit extracting and clearing instructions should execute in constant time, // and set flags. - case X86::BEXTR32rr: - case X86::BEXTR64rr: - case X86::BEXTRI32ri: - case X86::BEXTRI64ri: - case X86::BZHI32rr: - case X86::BZHI64rr: - + if (isBEXTR(Opcode) || isBZHI(Opcode)) + return true; // Shift and rotate. 
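
The isDataInvariant rewrite above replaces the exhaustive per-opcode switch with the generated X86::is<Mnemonic> family predicates (isIMUL, isBSF, isSHL, and so on), plus an early bail-out for anything that may touch memory. A minimal standalone model of that shape, with made-up opcodes, just to show the structure of the refactor:

  // Made-up opcodes; this only models the switch-to-family-predicates shape.
  enum ModelOpcode { ADD32rr, ADD64rr, SHL32ri, SHL64ri, DIV32r };

  static bool isModelADD(ModelOpcode Op) { return Op == ADD32rr || Op == ADD64rr; }
  static bool isModelSHL(ModelOpcode Op) { return Op == SHL32ri || Op == SHL64ri; }

  static bool isDataInvariantModel(ModelOpcode Op) {
    if (isModelADD(Op) || isModelSHL(Op))
      return true;
    // By default, assume the instruction is not data invariant (e.g. DIV,
    // whose latency depends on its operands).
    return false;
  }
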
- case X86::ROL8r1: - case X86::ROL16r1: - case X86::ROL32r1: - case X86::ROL64r1: - case X86::ROL8rCL: - case X86::ROL16rCL: - case X86::ROL32rCL: - case X86::ROL64rCL: - case X86::ROL8ri: - case X86::ROL16ri: - case X86::ROL32ri: - case X86::ROL64ri: - case X86::ROR8r1: - case X86::ROR16r1: - case X86::ROR32r1: - case X86::ROR64r1: - case X86::ROR8rCL: - case X86::ROR16rCL: - case X86::ROR32rCL: - case X86::ROR64rCL: - case X86::ROR8ri: - case X86::ROR16ri: - case X86::ROR32ri: - case X86::ROR64ri: - case X86::SAR8r1: - case X86::SAR16r1: - case X86::SAR32r1: - case X86::SAR64r1: - case X86::SAR8rCL: - case X86::SAR16rCL: - case X86::SAR32rCL: - case X86::SAR64rCL: - case X86::SAR8ri: - case X86::SAR16ri: - case X86::SAR32ri: - case X86::SAR64ri: - case X86::SHL8r1: - case X86::SHL16r1: - case X86::SHL32r1: - case X86::SHL64r1: - case X86::SHL8rCL: - case X86::SHL16rCL: - case X86::SHL32rCL: - case X86::SHL64rCL: - case X86::SHL8ri: - case X86::SHL16ri: - case X86::SHL32ri: - case X86::SHL64ri: - case X86::SHR8r1: - case X86::SHR16r1: - case X86::SHR32r1: - case X86::SHR64r1: - case X86::SHR8rCL: - case X86::SHR16rCL: - case X86::SHR32rCL: - case X86::SHR64rCL: - case X86::SHR8ri: - case X86::SHR16ri: - case X86::SHR32ri: - case X86::SHR64ri: - case X86::SHLD16rrCL: - case X86::SHLD32rrCL: - case X86::SHLD64rrCL: - case X86::SHLD16rri8: - case X86::SHLD32rri8: - case X86::SHLD64rri8: - case X86::SHRD16rrCL: - case X86::SHRD32rrCL: - case X86::SHRD64rrCL: - case X86::SHRD16rri8: - case X86::SHRD32rri8: - case X86::SHRD64rri8: - + if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) || + isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode)) + return true; // Basic arithmetic is constant time on the input but does set flags. - case X86::ADC8rr: - case X86::ADC8ri: - case X86::ADC16rr: - case X86::ADC16ri: - case X86::ADC16ri8: - case X86::ADC32rr: - case X86::ADC32ri: - case X86::ADC32ri8: - case X86::ADC64rr: - case X86::ADC64ri8: - case X86::ADC64ri32: - case X86::ADD8rr: - case X86::ADD8ri: - case X86::ADD16rr: - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD32rr: - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD64rr: - case X86::ADD64ri8: - case X86::ADD64ri32: - case X86::AND8rr: - case X86::AND8ri: - case X86::AND16rr: - case X86::AND16ri: - case X86::AND16ri8: - case X86::AND32rr: - case X86::AND32ri: - case X86::AND32ri8: - case X86::AND64rr: - case X86::AND64ri8: - case X86::AND64ri32: - case X86::OR8rr: - case X86::OR8ri: - case X86::OR16rr: - case X86::OR16ri: - case X86::OR16ri8: - case X86::OR32rr: - case X86::OR32ri: - case X86::OR32ri8: - case X86::OR64rr: - case X86::OR64ri8: - case X86::OR64ri32: - case X86::SBB8rr: - case X86::SBB8ri: - case X86::SBB16rr: - case X86::SBB16ri: - case X86::SBB16ri8: - case X86::SBB32rr: - case X86::SBB32ri: - case X86::SBB32ri8: - case X86::SBB64rr: - case X86::SBB64ri8: - case X86::SBB64ri32: - case X86::SUB8rr: - case X86::SUB8ri: - case X86::SUB16rr: - case X86::SUB16ri: - case X86::SUB16ri8: - case X86::SUB32rr: - case X86::SUB32ri: - case X86::SUB32ri8: - case X86::SUB64rr: - case X86::SUB64ri8: - case X86::SUB64ri32: - case X86::XOR8rr: - case X86::XOR8ri: - case X86::XOR16rr: - case X86::XOR16ri: - case X86::XOR16ri8: - case X86::XOR32rr: - case X86::XOR32ri: - case X86::XOR32ri8: - case X86::XOR64rr: - case X86::XOR64ri8: - case X86::XOR64ri32: + if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) || + isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode)) + return true; // Arithmetic with 
just 32-bit and 64-bit variants and no immediates. - case X86::ADCX32rr: - case X86::ADCX64rr: - case X86::ADOX32rr: - case X86::ADOX64rr: - case X86::ANDN32rr: - case X86::ANDN64rr: + if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode)) + return true; // Unary arithmetic operations. - case X86::DEC8r: - case X86::DEC16r: - case X86::DEC32r: - case X86::DEC64r: - case X86::INC8r: - case X86::INC16r: - case X86::INC32r: - case X86::INC64r: - case X86::NEG8r: - case X86::NEG16r: - case X86::NEG32r: - case X86::NEG64r: - + if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode)) + return true; // Unlike other arithmetic, NOT doesn't set EFLAGS. - case X86::NOT8r: - case X86::NOT16r: - case X86::NOT32r: - case X86::NOT64r: - + if (isNOT(Opcode)) + return true; // Various move instructions used to zero or sign extend things. Note that we // intentionally don't support the _NOREX variants as we can't handle that // register constraint anyways. - case X86::MOVSX16rr8: - case X86::MOVSX32rr8: - case X86::MOVSX32rr16: - case X86::MOVSX64rr8: - case X86::MOVSX64rr16: - case X86::MOVSX64rr32: - case X86::MOVZX16rr8: - case X86::MOVZX32rr8: - case X86::MOVZX32rr16: - case X86::MOVZX64rr8: - case X86::MOVZX64rr16: - case X86::MOV32rr: - + if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode)) + return true; // Arithmetic instructions that are both constant time and don't set flags. - case X86::RORX32ri: - case X86::RORX64ri: - case X86::SARX32rr: - case X86::SARX64rr: - case X86::SHLX32rr: - case X86::SHLX64rr: - case X86::SHRX32rr: - case X86::SHRX64rr: - + if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode)) + return true; // LEA doesn't actually access memory, and its arithmetic is constant time. - case X86::LEA16r: - case X86::LEA32r: - case X86::LEA64_32r: - case X86::LEA64r: + if (isLEA(Opcode)) return true; - } + // By default, assume that the instruction is not data invariant. + return false; } bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { @@ -990,6 +765,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: + case X86::FsFLD0SH: case X86::FsFLD0F128: case X86::KSET0D: case X86::KSET0Q: @@ -1192,6 +968,102 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { return ShAmt < 4 && ShAmt > 0; } +static bool findRedundantFlagInstr(MachineInstr &CmpInstr, + MachineInstr &CmpValDefInstr, + const MachineRegisterInfo *MRI, + MachineInstr **AndInstr, + const TargetRegisterInfo *TRI, + bool &NoSignFlag, bool &ClearsOverflowFlag) { + if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG) + return false; + + if (CmpInstr.getOpcode() != X86::TEST64rr) + return false; + + // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare` + // guarantees that it's analyzable only if two registers are identical. + assert( + (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) && + "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` " + "requires two reg operands are the same."); + + // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that + // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case + // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is + // redundant. 
+ assert( + (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) && + "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG."); + + // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically + // 0. + if (CmpValDefInstr.getOperand(1).getImm() != 0) + return false; + + // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically + // sub_32bit or sub_xmm. + if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit) + return false; + + MachineInstr *VregDefInstr = + MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg()); + + assert(VregDefInstr && "Must have a definition (SSA)"); + + // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB + // to simplify the subsequent analysis. + // + // FIXME: If `VregDefInstr->getParent()` is the only predecessor of + // `CmpValDefInstr.getParent()`, this could be handled. + if (VregDefInstr->getParent() != CmpValDefInstr.getParent()) + return false; + + if (X86::isAND(VregDefInstr->getOpcode())) { + // Get a sequence of instructions like + // %reg = and* ... // Set EFLAGS + // ... // EFLAGS not changed + // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit + // test64rr %extended_reg, %extended_reg, implicit-def $eflags + // + // If subsequent readers use a subset of bits that don't change + // after `and*` instructions, it's likely that the test64rr could + // be optimized away. + for (const MachineInstr &Instr : + make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)), + MachineBasicBlock::iterator(CmpValDefInstr))) { + // There are instructions between 'VregDefInstr' and + // 'CmpValDefInstr' that modifies EFLAGS. + if (Instr.modifiesRegister(X86::EFLAGS, TRI)) + return false; + } + + *AndInstr = VregDefInstr; + + // AND instruction will essentially update SF and clear OF, so + // NoSignFlag should be false in the sense that SF is modified by `AND`. + // + // However, the implementation artifically sets `NoSignFlag` to true + // to poison the SF bit; that is to say, if SF is looked at later, the + // optimization (to erase TEST64rr) will be disabled. + // + // The reason to poison SF bit is that SF bit value could be different + // in the `AND` and `TEST` operation; signed bit is not known for `AND`, + // and is known to be 0 as a result of `TEST64rr`. + // + // FIXME: As opposed to poisoning the SF bit directly, consider peeking into + // the AND instruction and using the static information to guide peephole + // optimization if possible. For example, it's possible to fold a + // conditional move into a copy if the relevant EFLAG bits could be deduced + // from an immediate operand of and operation. + // + NoSignFlag = true; + // ClearsOverflowFlag is true for AND operation (no surprise). 
+ ClearsOverflowFlag = true; + return true; + } + return false; +} + bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, @@ -1314,8 +1186,11 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::SHL8ri: case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); - MIB.addReg(0).addImm(1ULL << ShAmt) - .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); + MIB.addReg(0) + .addImm(1LL << ShAmt) + .addReg(InRegLEA, RegState::Kill) + .addImm(0) + .addReg(0); break; } case X86::INC8r: @@ -1478,7 +1353,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) .add(Dest) .addReg(0) - .addImm(1ULL << ShAmt) + .addImm(1LL << ShAmt) .add(Src) .addImm(0) .addReg(0); @@ -1502,7 +1377,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(0) - .addImm(1ULL << ShAmt) + .addImm(1LL << ShAmt) .addReg(SrcReg, getKillRegState(isKill)) .addImm(0) .addReg(0); @@ -1957,14 +1832,13 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( FMAForms[0] = FMA3Group.get132Opcode(); FMAForms[1] = FMA3Group.get213Opcode(); FMAForms[2] = FMA3Group.get231Opcode(); - unsigned FormIndex; - for (FormIndex = 0; FormIndex < 3; FormIndex++) - if (Opc == FMAForms[FormIndex]) - break; // Everything is ready, just adjust the FMA opcode and return it. - FormIndex = FormMapping[Case][FormIndex]; - return FMAForms[FormIndex]; + for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++) + if (Opc == FMAForms[FormIndex]) + return FMAForms[FormMapping[Case][FormIndex]]; + + llvm_unreachable("Illegal FMA3 format"); } static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, @@ -2141,7 +2015,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, if ((MI.getOperand(3).getImm() ^ Mask) == 1) { auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - WorkingMI.RemoveOperand(3); + WorkingMI.removeOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); @@ -2238,7 +2112,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::MOVSDrr)); - WorkingMI.RemoveOperand(3); + WorkingMI.removeOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -2813,34 +2687,37 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const { return false; } +int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) { + unsigned Opcode = MCID.getOpcode(); + if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode))) + return -1; + // Assume that condition code is always the last use operand. 
+ unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs(); + return NumUses - 1; +} + +X86::CondCode X86::getCondFromMI(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + int CondNo = getCondSrcNoFromDesc(MCID); + if (CondNo < 0) + return X86::COND_INVALID; + CondNo += MCID.getNumDefs(); + return static_cast(MI.getOperand(CondNo).getImm()); +} + X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::JCC_1: - return static_cast( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } -/// Return condition code of a SETCC opcode. X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::SETCCr: case X86::SETCCm: - return static_cast( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } -/// Return condition code of a CMov opcode. X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return X86::COND_INVALID; - case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: - case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm: - return static_cast( - MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); - } + return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI) + : X86::COND_INVALID; } /// Return the inverse of the specified condition, @@ -3166,8 +3043,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // If the block has any instructions after a JMP, delete them. - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); + MBB.erase(std::next(I), MBB.end()); Cond.clear(); FBB = nullptr; @@ -3464,7 +3340,7 @@ bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. - if (!Subtarget.hasCMov()) + if (!Subtarget.canUseCMOV()) return false; if (Cond.size() != 1) return false; @@ -3708,10 +3584,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; - if (X86::FR16XRegClass.hasSubClassEq(RC)) { - assert(STI.hasFP16()); - return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; - } assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: @@ -3739,6 +3611,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg, X86::VK8PAIRRegClass.hasSubClassEq(RC) || X86::VK16PAIRRegClass.hasSubClassEq(RC)) return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; + if ((X86::FR16RegClass.hasSubClassEq(RC) || + X86::FR16XRegClass.hasSubClassEq(RC)) && + STI.hasFP16()) + return load ? 
X86::VMOVSHZrm_alt : X86::VMOVSHZmr; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) @@ -3845,6 +3721,35 @@ X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, return AM; } +bool X86InstrInfo::verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const { + Optional AMOrNone = getAddrModeFromMemoryOp(MI, nullptr); + if (!AMOrNone) + return true; + + ExtAddrMode AM = *AMOrNone; + + if (AM.ScaledReg != X86::NoRegister) { + switch (AM.Scale) { + case 1: + case 2: + case 4: + case 8: + break; + default: + ErrInfo = "Scale factor in address must be 1, 2, 4 or 8"; + return false; + } + } + if (!isInt<32>(AM.Displacement)) { + ErrInfo = "Displacement in address must fit into 32-bit signed " + "integer"; + return false; + } + + return true; +} + bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const { @@ -3949,12 +3854,12 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILESTORED; // tilestored %tmm, (%sp, %idx) - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); MachineInstr *NewMI = @@ -3963,6 +3868,14 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr + : Subtarget.hasAVX() ? X86::VMOVSSmr + : X86::MOVSSmr; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3991,6 +3904,14 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm + : Subtarget.hasAVX() ? X86::VMOVSSrm + : X86::MOVSSrm; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4375,7 +4296,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; } CmpInstr.setDesc(get(NewOpcode)); - CmpInstr.RemoveOperand(0); + CmpInstr.removeOperand(0); // Mutating this instruction invalidates any debug data associated with it. CmpInstr.dropDebugNumber(); // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. @@ -4423,6 +4344,23 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, MI = &Inst; break; } + + // Look back for the following pattern, in which case the test64rr + // instruction could be erased. + // + // Example: + // %reg = and32ri %in_reg, 5 + // ... 
// EFLAGS not changed. + // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index + // test64rr %src_reg, %src_reg, implicit-def $eflags + MachineInstr *AndInstr = nullptr; + if (IsCmpZero && + findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI, + NoSignFlag, ClearsOverflowFlag)) { + assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode())); + MI = AndInstr; + break; + } // Cannot find other candidates before definition of SrcReg. return false; } @@ -4524,6 +4462,11 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: + // If SF is used, but the instruction doesn't update the SF, then we + // can't do the optimization. + if (NoSignFlag) + return false; + LLVM_FALLTHROUGH; case X86::COND_O: case X86::COND_NO: // If OF is used, the instruction needs to clear it like CmpZero does. if (!ClearsOverflowFlag) @@ -4811,7 +4754,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm); MIB->setDesc(TII.get(X86::POP32r)); } - MIB->RemoveOperand(1); + MIB->removeOperand(1); MIB->addImplicitDefUseOperands(*MBB.getParent()); // Build CFI if necessary. @@ -4918,7 +4861,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { MIB->setDesc(Desc); int64_t ShiftAmt = MIB->getOperand(2).getImm(); // Temporarily remove the immediate so we can add another source register. - MIB->RemoveOperand(2); + MIB->removeOperand(2); // Add the register. Don't copy the kill flag if there is one. MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef())); @@ -4949,6 +4892,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0SH: case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { @@ -5026,7 +4970,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; - MI.RemoveOperand(1); + MI.removeOperand(1); MIB->setDesc(get(Opc)); // VPTERNLOG needs 3 register inputs and an immediate. // 0xff will return 1s for any input. 
@@ -5165,6 +5109,255 @@ static bool hasPartialRegUpdate(unsigned Opcode, case X86::SQRTSDr_Int: case X86::SQRTSDm_Int: return true; + case X86::VFCMULCPHZ128rm: + case X86::VFCMULCPHZ128rmb: + case X86::VFCMULCPHZ128rmbkz: + case X86::VFCMULCPHZ128rmkz: + case X86::VFCMULCPHZ128rr: + case X86::VFCMULCPHZ128rrkz: + case X86::VFCMULCPHZ256rm: + case X86::VFCMULCPHZ256rmb: + case X86::VFCMULCPHZ256rmbkz: + case X86::VFCMULCPHZ256rmkz: + case X86::VFCMULCPHZ256rr: + case X86::VFCMULCPHZ256rrkz: + case X86::VFCMULCPHZrm: + case X86::VFCMULCPHZrmb: + case X86::VFCMULCPHZrmbkz: + case X86::VFCMULCPHZrmkz: + case X86::VFCMULCPHZrr: + case X86::VFCMULCPHZrrb: + case X86::VFCMULCPHZrrbkz: + case X86::VFCMULCPHZrrkz: + case X86::VFMULCPHZ128rm: + case X86::VFMULCPHZ128rmb: + case X86::VFMULCPHZ128rmbkz: + case X86::VFMULCPHZ128rmkz: + case X86::VFMULCPHZ128rr: + case X86::VFMULCPHZ128rrkz: + case X86::VFMULCPHZ256rm: + case X86::VFMULCPHZ256rmb: + case X86::VFMULCPHZ256rmbkz: + case X86::VFMULCPHZ256rmkz: + case X86::VFMULCPHZ256rr: + case X86::VFMULCPHZ256rrkz: + case X86::VFMULCPHZrm: + case X86::VFMULCPHZrmb: + case X86::VFMULCPHZrmbkz: + case X86::VFMULCPHZrmkz: + case X86::VFMULCPHZrr: + case X86::VFMULCPHZrrb: + case X86::VFMULCPHZrrbkz: + case X86::VFMULCPHZrrkz: + case X86::VFCMULCSHZrm: + case X86::VFCMULCSHZrmkz: + case X86::VFCMULCSHZrr: + case X86::VFCMULCSHZrrb: + case X86::VFCMULCSHZrrbkz: + case X86::VFCMULCSHZrrkz: + case X86::VFMULCSHZrm: + case X86::VFMULCSHZrmkz: + case X86::VFMULCSHZrr: + case X86::VFMULCSHZrrb: + case X86::VFMULCSHZrrbkz: + case X86::VFMULCSHZrrkz: + return Subtarget.hasMULCFalseDeps(); + case X86::VPERMDYrm: + case X86::VPERMDYrr: + case X86::VPERMQYmi: + case X86::VPERMQYri: + case X86::VPERMPSYrm: + case X86::VPERMPSYrr: + case X86::VPERMPDYmi: + case X86::VPERMPDYri: + case X86::VPERMDZ256rm: + case X86::VPERMDZ256rmb: + case X86::VPERMDZ256rmbkz: + case X86::VPERMDZ256rmkz: + case X86::VPERMDZ256rr: + case X86::VPERMDZ256rrkz: + case X86::VPERMDZrm: + case X86::VPERMDZrmb: + case X86::VPERMDZrmbkz: + case X86::VPERMDZrmkz: + case X86::VPERMDZrr: + case X86::VPERMDZrrkz: + case X86::VPERMQZ256mbi: + case X86::VPERMQZ256mbikz: + case X86::VPERMQZ256mi: + case X86::VPERMQZ256mikz: + case X86::VPERMQZ256ri: + case X86::VPERMQZ256rikz: + case X86::VPERMQZ256rm: + case X86::VPERMQZ256rmb: + case X86::VPERMQZ256rmbkz: + case X86::VPERMQZ256rmkz: + case X86::VPERMQZ256rr: + case X86::VPERMQZ256rrkz: + case X86::VPERMQZmbi: + case X86::VPERMQZmbikz: + case X86::VPERMQZmi: + case X86::VPERMQZmikz: + case X86::VPERMQZri: + case X86::VPERMQZrikz: + case X86::VPERMQZrm: + case X86::VPERMQZrmb: + case X86::VPERMQZrmbkz: + case X86::VPERMQZrmkz: + case X86::VPERMQZrr: + case X86::VPERMQZrrkz: + case X86::VPERMPSZ256rm: + case X86::VPERMPSZ256rmb: + case X86::VPERMPSZ256rmbkz: + case X86::VPERMPSZ256rmkz: + case X86::VPERMPSZ256rr: + case X86::VPERMPSZ256rrkz: + case X86::VPERMPSZrm: + case X86::VPERMPSZrmb: + case X86::VPERMPSZrmbkz: + case X86::VPERMPSZrmkz: + case X86::VPERMPSZrr: + case X86::VPERMPSZrrkz: + case X86::VPERMPDZ256mbi: + case X86::VPERMPDZ256mbikz: + case X86::VPERMPDZ256mi: + case X86::VPERMPDZ256mikz: + case X86::VPERMPDZ256ri: + case X86::VPERMPDZ256rikz: + case X86::VPERMPDZ256rm: + case X86::VPERMPDZ256rmb: + case X86::VPERMPDZ256rmbkz: + case X86::VPERMPDZ256rmkz: + case X86::VPERMPDZ256rr: + case X86::VPERMPDZ256rrkz: + case X86::VPERMPDZmbi: + case X86::VPERMPDZmbikz: + case X86::VPERMPDZmi: + case X86::VPERMPDZmikz: + case X86::VPERMPDZri: + case 
X86::VPERMPDZrikz: + case X86::VPERMPDZrm: + case X86::VPERMPDZrmb: + case X86::VPERMPDZrmbkz: + case X86::VPERMPDZrmkz: + case X86::VPERMPDZrr: + case X86::VPERMPDZrrkz: + return Subtarget.hasPERMFalseDeps(); + case X86::VRANGEPDZ128rmbi: + case X86::VRANGEPDZ128rmbikz: + case X86::VRANGEPDZ128rmi: + case X86::VRANGEPDZ128rmikz: + case X86::VRANGEPDZ128rri: + case X86::VRANGEPDZ128rrikz: + case X86::VRANGEPDZ256rmbi: + case X86::VRANGEPDZ256rmbikz: + case X86::VRANGEPDZ256rmi: + case X86::VRANGEPDZ256rmikz: + case X86::VRANGEPDZ256rri: + case X86::VRANGEPDZ256rrikz: + case X86::VRANGEPDZrmbi: + case X86::VRANGEPDZrmbikz: + case X86::VRANGEPDZrmi: + case X86::VRANGEPDZrmikz: + case X86::VRANGEPDZrri: + case X86::VRANGEPDZrrib: + case X86::VRANGEPDZrribkz: + case X86::VRANGEPDZrrikz: + case X86::VRANGEPSZ128rmbi: + case X86::VRANGEPSZ128rmbikz: + case X86::VRANGEPSZ128rmi: + case X86::VRANGEPSZ128rmikz: + case X86::VRANGEPSZ128rri: + case X86::VRANGEPSZ128rrikz: + case X86::VRANGEPSZ256rmbi: + case X86::VRANGEPSZ256rmbikz: + case X86::VRANGEPSZ256rmi: + case X86::VRANGEPSZ256rmikz: + case X86::VRANGEPSZ256rri: + case X86::VRANGEPSZ256rrikz: + case X86::VRANGEPSZrmbi: + case X86::VRANGEPSZrmbikz: + case X86::VRANGEPSZrmi: + case X86::VRANGEPSZrmikz: + case X86::VRANGEPSZrri: + case X86::VRANGEPSZrrib: + case X86::VRANGEPSZrribkz: + case X86::VRANGEPSZrrikz: + case X86::VRANGESDZrmi: + case X86::VRANGESDZrmikz: + case X86::VRANGESDZrri: + case X86::VRANGESDZrrib: + case X86::VRANGESDZrribkz: + case X86::VRANGESDZrrikz: + case X86::VRANGESSZrmi: + case X86::VRANGESSZrmikz: + case X86::VRANGESSZrri: + case X86::VRANGESSZrrib: + case X86::VRANGESSZrribkz: + case X86::VRANGESSZrrikz: + return Subtarget.hasRANGEFalseDeps(); + case X86::VGETMANTSSZrmi: + case X86::VGETMANTSSZrmikz: + case X86::VGETMANTSSZrri: + case X86::VGETMANTSSZrrib: + case X86::VGETMANTSSZrribkz: + case X86::VGETMANTSSZrrikz: + case X86::VGETMANTSDZrmi: + case X86::VGETMANTSDZrmikz: + case X86::VGETMANTSDZrri: + case X86::VGETMANTSDZrrib: + case X86::VGETMANTSDZrribkz: + case X86::VGETMANTSDZrrikz: + case X86::VGETMANTSHZrmi: + case X86::VGETMANTSHZrmikz: + case X86::VGETMANTSHZrri: + case X86::VGETMANTSHZrrib: + case X86::VGETMANTSHZrribkz: + case X86::VGETMANTSHZrrikz: + case X86::VGETMANTPSZ128rmbi: + case X86::VGETMANTPSZ128rmbikz: + case X86::VGETMANTPSZ128rmi: + case X86::VGETMANTPSZ128rmikz: + case X86::VGETMANTPSZ256rmbi: + case X86::VGETMANTPSZ256rmbikz: + case X86::VGETMANTPSZ256rmi: + case X86::VGETMANTPSZ256rmikz: + case X86::VGETMANTPSZrmbi: + case X86::VGETMANTPSZrmbikz: + case X86::VGETMANTPSZrmi: + case X86::VGETMANTPSZrmikz: + case X86::VGETMANTPDZ128rmbi: + case X86::VGETMANTPDZ128rmbikz: + case X86::VGETMANTPDZ128rmi: + case X86::VGETMANTPDZ128rmikz: + case X86::VGETMANTPDZ256rmbi: + case X86::VGETMANTPDZ256rmbikz: + case X86::VGETMANTPDZ256rmi: + case X86::VGETMANTPDZ256rmikz: + case X86::VGETMANTPDZrmbi: + case X86::VGETMANTPDZrmbikz: + case X86::VGETMANTPDZrmi: + case X86::VGETMANTPDZrmikz: + return Subtarget.hasGETMANTFalseDeps(); + case X86::VPMULLQZ128rm: + case X86::VPMULLQZ128rmb: + case X86::VPMULLQZ128rmbkz: + case X86::VPMULLQZ128rmkz: + case X86::VPMULLQZ128rr: + case X86::VPMULLQZ128rrkz: + case X86::VPMULLQZ256rm: + case X86::VPMULLQZ256rmb: + case X86::VPMULLQZ256rmbkz: + case X86::VPMULLQZ256rmkz: + case X86::VPMULLQZ256rr: + case X86::VPMULLQZ256rrkz: + case X86::VPMULLQZrm: + case X86::VPMULLQZrmb: + case X86::VPMULLQZrmbkz: + case X86::VPMULLQZrmkz: + case X86::VPMULLQZrr: + case 
X86::VPMULLQZrrkz: + return Subtarget.hasMULLQFalseDeps(); // GPR case X86::POPCNT32rm: case X86::POPCNT32rr: @@ -5591,6 +5784,28 @@ void X86InstrInfo::breakPartialRegDependency( .addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR128XRegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Since vxorps requires AVX512DQ, vpxord should be the best choice. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256XRegClass.contains(Reg) || + X86::VR512RegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Use vpxord to clear the full ymm/zmm register. + // It wants to read and write the xmm sub-register. + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. @@ -6413,6 +6628,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: Alignment = Align(2); break; @@ -6451,6 +6667,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: @@ -6490,7 +6707,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); - else if (Opc == X86::AVX512_FsFLD0SH) + else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH) Ty = Type::getHalfTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), @@ -7170,7 +7387,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, // ENDBR instructions should not be scheduled around. unsigned Opcode = MI.getOpcode(); if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 || - Opcode == X86::LDTILECFG) + Opcode == X86::PLDTILECFGV) return true; return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); @@ -9298,12 +9515,10 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo( // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. 
unsigned CFICount = 0; - MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); - for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); - Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { - if (MBBI->isCFIInstruction()) + for (auto &I : make_range(RepeatedSequenceLocs[0].front(), + std::next(RepeatedSequenceLocs[0].back()))) { + if (I.isCFIInstruction()) CFICount++; - MBBI++; } // We compare the number of found CFI Instructions to the number of CFI @@ -9440,7 +9655,7 @@ MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const { + outliner::Candidate &C) const { // Is it a tail call? if (C.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 33ce55bbdb2b..4943d2152fd2 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -40,13 +40,21 @@ std::pair getX86ConditionCode(CmpInst::Predicate Predicate); /// Return a cmov opcode for the given register size in bytes, and operand type. unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false); -// Turn jCC instruction into condition code. +/// Return the source operand # for condition code by \p MCID. If the +/// instruction doesn't have a condition code, return -1. +int getCondSrcNoFromDesc(const MCInstrDesc &MCID); + +/// Return the condition code of the instruction. If the instruction doesn't +/// have a condition code, return X86::COND_INVALID. +CondCode getCondFromMI(const MachineInstr &MI); + +// Turn JCC instruction into condition code. CondCode getCondFromBranch(const MachineInstr &MI); -// Turn setCC instruction into condition code. +// Turn SETCC instruction into condition code. CondCode getCondFromSETCC(const MachineInstr &MI); -// Turn CMov instruction into condition code. +// Turn CMOV instruction into condition code. 
CondCode getCondFromCMov(const MachineInstr &MI); /// GetOppositeBranchCondition - Return the inverse of the specified cond, @@ -552,8 +560,10 @@ public: MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const override; + outliner::Candidate &C) const override; + bool verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const override; #define GET_INSTRINFO_HELPER_DECLS #include "X86GenInstrInfo.inc" diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index fee9939b8dfc..7f6ef3479d40 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -388,17 +388,19 @@ def X86AbsMemAsmOperand : AsmOperandClass { } class X86MemOperand : Operand { + AsmOperandClass parserMatchClass = X86MemAsmOperand, + int size = 0> : Operand { let PrintMethod = printMethod; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); let ParserMatchClass = parserMatchClass; let OperandType = "OPERAND_MEMORY"; + int Size = size; } // Gather mem operands class X86VMemOperand - : X86MemOperand { + AsmOperandClass parserMatchClass, int size = 0> + : X86MemOperand { let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG); } @@ -413,48 +415,45 @@ def opaquemem : X86MemOperand<"printMemReference">; def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>; -def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>; -def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; -def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; -def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; -def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; -def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; -def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; -def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; -def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; -def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; -def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; -def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; -def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; -def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand, 8>; +def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand, 16>; +def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand, 32>; +def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64>; +def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand, 128>; +def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand, 256>; +def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand, 512>; +def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand, 16>; +def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand, 32>; +def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64>; +def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand, 80>; +def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand, 128>; +def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand, 256>; +def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand, 512>; // Gather mem operands -def vx64mem : X86VMemOperand; -def vx128mem : X86VMemOperand; -def vx256mem : X86VMemOperand; -def vy128mem : 
X86VMemOperand; -def vy256mem : X86VMemOperand; - -def vx64xmem : X86VMemOperand; -def vx128xmem : X86VMemOperand; -def vx256xmem : X86VMemOperand; -def vy128xmem : X86VMemOperand; -def vy256xmem : X86VMemOperand; -def vy512xmem : X86VMemOperand; -def vz256mem : X86VMemOperand; -def vz512mem : X86VMemOperand; +def vx64mem : X86VMemOperand; +def vx128mem : X86VMemOperand; +def vx256mem : X86VMemOperand; +def vy128mem : X86VMemOperand; +def vy256mem : X86VMemOperand; + +def vx64xmem : X86VMemOperand; +def vx128xmem : X86VMemOperand; +def vx256xmem : X86VMemOperand; +def vy128xmem : X86VMemOperand; +def vy256xmem : X86VMemOperand; +def vy512xmem : X86VMemOperand; +def vz256mem : X86VMemOperand; +def vz512mem : X86VMemOperand; // A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead // of a plain GPR, so that it doesn't potentially require a REX prefix. def ptr_rc_norex : PointerLikeRegClass<2>; def ptr_rc_norex_nosp : PointerLikeRegClass<3>; -def i8mem_NOREX : Operand { - let PrintMethod = "printbytemem"; +def i8mem_NOREX : X86MemOperand<"printbytemem", X86Mem8AsmOperand, 8> { let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, SEGMENT_REG); - let ParserMatchClass = X86Mem8AsmOperand; - let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. @@ -840,11 +839,11 @@ def VK16Pair : RegisterOperand { // Define X86-specific addressing mode. def addr : ComplexPattern; def lea32addr : ComplexPattern; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. def lea64_32addr : ComplexPattern; @@ -855,7 +854,7 @@ def tls32baseaddr : ComplexPattern; def lea64addr : ComplexPattern; def tls64addr : ComplexPattern; -def HasCMov : Predicate<"Subtarget->hasCMov()">; -def NoCMov : Predicate<"!Subtarget->hasCMov()">; +def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; +def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; -def Has3DNow : Predicate<"Subtarget->has3DNow()">; -def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def Has3DNow : Predicate<"Subtarget->hasThreeDNow()">; +def Has3DNowA : Predicate<"Subtarget->hasThreeDNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; @@ -981,8 +980,8 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">; def HasRDPID : Predicate<"Subtarget->hasRDPID()">; def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">; def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">; -def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">; -def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def HasCX8 : Predicate<"Subtarget->hasCX8()">; +def HasCX16 : Predicate<"Subtarget->hasCX16()">; def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">; def HasKL : Predicate<"Subtarget->hasKL()">; @@ -996,25 +995,25 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, - AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">; + AssemblerPredicate<(all_of (not Is64Bit)), "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, - AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">; + AssemblerPredicate<(all_of Is64Bit), "64-bit mode">; def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; def NotLP64 : 
Predicate<"!Subtarget->isTarget64BitLP64()">; def In16BitMode : Predicate<"Subtarget->is16Bit()">, - AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">; + AssemblerPredicate<(all_of Is16Bit), "16-bit mode">; def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, - AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">; + AssemblerPredicate<(all_of (not Is16Bit)), "Not 16-bit mode">; def In32BitMode : Predicate<"Subtarget->is32Bit()">, - AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">; + AssemblerPredicate<(all_of Is32Bit), "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" "Subtarget->getFrameLowering()->hasFP(*MF)"> { let RecomputePerFunction = 1; } -def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; -def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; +def IsPS : Predicate<"Subtarget->isTargetPS()">; +def NotPS : Predicate<"!Subtarget->isTargetPS()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; @@ -2229,13 +2228,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), - "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>; + "cmpxchg8b\t$dst", []>, TB, Requires<[HasCX8]>; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in // NOTE: In64BitMode check needed for the AssemblerPredicate. def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", []>, - TB, Requires<[HasCmpxchg16b,In64BitMode]>; + TB, Requires<[HasCX16,In64BitMode]>; } // SchedRW, mayLoad, mayStore, hasSideEffects @@ -2851,7 +2850,7 @@ let SchedRW = [WriteSystem] in { def TPAUSE : I<0xAE, MRM6r, (outs), (ins GR32orGR64:$src), "tpause\t$src", [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>, - PD, Requires<[HasWAITPKG]>, NotMemoryFoldable; + PD, Requires<[HasWAITPKG]>; } } // SchedRW @@ -2939,7 +2938,7 @@ def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>; let SchedRW = [WriteSystem] in { let Uses = [EAX, EDX] in def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins), - "invlpgb}", []>, + "invlpgb", []>, PS, Requires<[Not64BitMode]>; let Uses = [RAX, EDX] in def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins), @@ -3124,7 +3123,7 @@ def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), let Predicates = [HasCLWB], SchedRW = [WriteLoad] in def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", - [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable; + [(int_x86_clwb addr:$src)]>, PD; let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index aeecc25ddea2..4196aff240c4 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -211,10 +211,10 @@ def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", (MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, +def MMX_MOVD64from64mr : MMXRI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, - Sched<[SchedWriteVecMoveLS.MMX.MR]>; + 
Sched<[SchedWriteVecMoveLS.MMX.MR]>, NotMemoryFoldable; let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in { let canFoldAsLoad = 1 in diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 035f139e6f33..06cb280e860a 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -112,6 +112,8 @@ multiclass sse12_fp_packed_logical_rm opc, RegisterClass RC, Domain d, // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "", + [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>; def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", @@ -3471,9 +3473,9 @@ defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, +defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; -defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, +defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, SchedWriteVecIMul, 1, NoVLX>; @@ -3965,6 +3967,20 @@ defm PINSRW : sse2_pinsrw, PD; } // ExeDomain = SSEPackedInt +// Always select FP16 instructions if available. +let Predicates = [UseSSE2], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + +let Predicates = [HasAVX, NoBWI] in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Mask Creation //===---------------------------------------------------------------------===// @@ -3997,7 +4013,10 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), //===---------------------------------------------------------------------===// let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { -let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in +// As VEX does not have separate instruction contexts for address size +// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict. +// Prefer VMASKMODDQU64. 
+let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", @@ -4008,32 +4027,16 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, - VEX, VEX_WIG, AdSize64; -let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in -def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs), - (ins VR128:$src, VR128:$mask), "", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, - VEX, VEX_WIG, AdSize32 { - let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}"; - let AsmVariantName = "NonParsable"; -} + VEX, VEX_WIG; -let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in +let Uses = [EDI], Predicates = [UseSSE2] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, - AdSize64; -let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in -def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), - "addr32 maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, - AdSize32 { - let AsmVariantName = "NonParsable"; -} + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; } // ExeDomain = SSEPackedInt @@ -5206,6 +5209,12 @@ let Predicates = [HasAVX, NoBWI] in defm PEXTRW : SS41I_extract16<0x15, "pextrw">; +let Predicates = [UseSSE41] in + def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + +let Predicates = [HasAVX, NoBWI] in + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract32 opc, string OpcodeStr> { @@ -7588,6 +7597,21 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { (VPBROADCASTWYrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit))))>; + + def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWYrm addr:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))), + (VPBROADCASTWrr VR128:$src)>; + def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))), + (VPBROADCASTWYrr VR128:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))), + (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>; + def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))), + (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>; } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index b4dd99d08a62..3a653a56e534 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -25,18 +25,18 @@ let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in { def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2), - "ud1{w} {$src2, $src1|$src1, $src2}", []>, 
TB, OpSize16; + "ud1{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16; def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2), - "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32; + "ud1{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32; def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2), - "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB; + "ud1{q}\t{$src2, $src1|$src1, $src2}", []>, TB; def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2), - "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16; + "ud1{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16; def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2), - "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32; + "ud1{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32; def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2), - "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB; + "ud1{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } let isTerminator = 1 in @@ -71,9 +71,9 @@ def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB, } // SchedRW def : Pat<(debugtrap), - (INT3)>, Requires<[NotPS4]>; + (INT3)>, Requires<[NotPS]>; def : Pat<(debugtrap), - (INT (i8 0x41))>, Requires<[IsPS4]>; + (INT (i8 0x41))>, Requires<[IsPS]>; //===----------------------------------------------------------------------===// // Input/Output Instructions. diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td index 28563eeb4484..7671eb4676ee 100644 --- a/llvm/lib/Target/X86/X86InstrTSX.td +++ b/llvm/lib/Target/X86/X86InstrTSX.td @@ -51,6 +51,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), // HLE prefixes let SchedRW = [WriteSystem] in { +// XACQUIRE and XRELEASE reuse REPNE and REP respectively. +// For now, just prefer the REP versions. let isAsmParserOnly = 1 in { def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>; def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>; diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td index 2429aa113fb1..e6ecbb652100 100644 --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -17,6 +17,8 @@ let Predicates = [NoAVX512] in { // A vector extract of the first f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>; def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), @@ -34,8 +36,8 @@ let Predicates = [HasAVX512] in { } let Predicates = [NoVLX] in { - def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), - (COPY_TO_REGCLASS FR16X:$src, VR128)>; + def : Pat<(v8f16 (scalar_to_vector FR16:$src)), + (COPY_TO_REGCLASS FR16:$src, VR128)>; // Implicitly promote a 32-bit scalar to a vector. 
def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index a5976b7d2d74..d89e481f4522 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -13,11 +13,11 @@ multiclass xop2op opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP, XOP, Sched<[SchedWritePHAdd.XMM]>; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWriteVecALU.XMM]>; def rm : IXOP, XOP, - Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>; + Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } let ExeDomain = SSEPackedInt in { diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp index 28d57ca9ae3c..ff701159b95e 100644 --- a/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -21,7 +21,6 @@ #include "X86TargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -31,6 +30,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" @@ -179,6 +179,8 @@ X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const { return &X86::GR64RegClass; } if (RB.getID() == X86::VECRRegBankID) { + if (Ty.getSizeInBits() == 16) + return STI.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; if (Ty.getSizeInBits() == 32) return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; if (Ty.getSizeInBits() == 64) @@ -516,7 +518,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, // is already on the instruction we're mutating, and thus we don't need to // make any changes. So long as we select an opcode which is capable of // loading or storing the appropriate size atomically, the rest of the - // backend is required to respect the MMO state. + // backend is required to respect the MMO state. 
if (!MemOp.isUnordered()) { LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n"); return false; @@ -537,12 +539,12 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); if (Opc == TargetOpcode::G_LOAD) { - I.RemoveOperand(1); + I.removeOperand(1); addFullAddress(MIB, AM); } else { // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL) - I.RemoveOperand(1); - I.RemoveOperand(0); + I.removeOperand(1); + I.removeOperand(0); addFullAddress(MIB, AM).addUse(DefReg); } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); @@ -625,7 +627,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); - I.RemoveOperand(1); + I.removeOperand(1); addFullAddress(MIB, AM); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); @@ -1412,7 +1414,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, - MF.getDataLayout().getPointerSize(), Alignment); + LLT::pointer(0, MF.getDataLayout().getPointerSizeInBits()), Alignment); LoadInst = addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 1edec96bbec3..3c8be95b43e3 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -371,8 +371,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), @@ -818,8 +818,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1281,8 +1281,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, 
ISD::AVGCEILU, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 4710e524931c..23976fb1a142 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -558,7 +558,7 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( } // Find and eliminate gadget edges that have been mitigated. - int MitigatedGadgets = 0, RemainingGadgets = 0; + int RemainingGadgets = 0; NodeSet ReachableNodes{G}; for (const Node &RootN : G.nodes()) { if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge)) @@ -586,7 +586,6 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( // This gadget's sink is reachable ++RemainingGadgets; } else { // This gadget's sink is unreachable, and therefore mitigated - ++MitigatedGadgets; ElimEdges.insert(E); } } diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 6b564a0356a6..70964b352b8c 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 6206d8efb3d0..540182cb7911 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,6 +74,24 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic(m_Value())); } +static bool isAMXIntrinsic(Value *I) { + auto *II = dyn_cast(I); + if (!II) + return false; + if (isAMXCast(II)) + return false; + // Check if return type or parameter is x86_amx. If it is x86_amx + // the intrinsic must be x86 amx intrinsics. + if (II->getType()->isX86_AMXTy()) + return true; + for (Value *V : II->args()) { + if (V->getType()->isX86_AMXTy()) + return true; + } + + return false; +} + static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB, Type *Ty) { Function &F = *BB->getParent(); @@ -162,6 +180,36 @@ static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { return std::make_pair(Row, Col); } +static std::pair getShape(PHINode *Phi) { + Use &U = *(Phi->use_begin()); + unsigned OpNo = U.getOperandNo(); + User *V = U.getUser(); + // TODO We don't traverse all users. To make the algorithm simple, here we + // just traverse the first user. If we can find shape, then return the shape, + // otherwise just return nullptr and the optimization for undef/zero will be + // abandoned. 
+ while (V) { + if (isAMXCast(dyn_cast(V))) { + if (V->use_empty()) + break; + Use &U = *(V->use_begin()); + OpNo = U.getOperandNo(); + V = U.getUser(); + } else if (isAMXIntrinsic(V)) { + return getShape(cast(V), OpNo); + } else if (isa(V)) { + if (V->use_empty()) + break; + Use &U = *(V->use_begin()); + V = U.getUser(); + } else { + break; + } + } + + return std::make_pair(nullptr, nullptr); +} + namespace { class X86LowerAMXType { Function &Func; @@ -655,6 +703,9 @@ class X86LowerAMXCast { public: X86LowerAMXCast(Function &F) : Func(F) {} + void combineCastStore(IntrinsicInst *Cast, StoreInst *ST); + void combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); + bool combineLdSt(SmallVectorImpl &Casts); bool combineAMXcast(TargetLibraryInfo *TLI); bool transformAMXCast(IntrinsicInst *AMXCast); bool transformAllAMXCast(); @@ -720,11 +771,33 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( OldPhiNodes.insert(PN); while (!PhiWorklist.empty()) { auto *OldPN = PhiWorklist.pop_back_val(); - for (Value *IncValue : OldPN->incoming_values()) { + for (unsigned I = 0; I < OldPN->getNumOperands(); ++I) { + Value *IncValue = OldPN->getIncomingValue(I); // TODO: currently, We ignore cases where it is a const. In the future, we // might support const. - if (isa(IncValue)) - return false; + if (isa(IncValue)) { + auto *IncConst = dyn_cast(IncValue); + if (!isa(IncValue) && !IncConst->isZeroValue()) + return false; + Value *Row = nullptr, *Col = nullptr; + std::tie(Row, Col) = getShape(OldPN); + // TODO: If it is not constant the Row and Col must domoniate tilezero + // that we are going to create. + if (!Row || !Col || !isa(Row) || !isa(Col)) + return false; + // Create tilezero at the end of incoming block. + auto *Block = OldPN->getIncomingBlock(I); + BasicBlock::iterator Iter = Block->getTerminator()->getIterator(); + Instruction *NewInst = Builder.CreateIntrinsic( + Intrinsic::x86_tilezero_internal, None, {Row, Col}); + NewInst->moveBefore(&*Iter); + NewInst = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {IncValue->getType()}, {NewInst}); + NewInst->moveBefore(&*Iter); + // Replace InValue with new Value. + OldPN->setIncomingValue(I, NewInst); + IncValue = NewInst; + } if (auto *PNode = dyn_cast(IncValue)) { if (OldPhiNodes.insert(PNode)) @@ -838,6 +911,99 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } +// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) +// store <256 x i32> %43, <256 x i32>* %p, align 64 +// --> +// call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p, +// i64 64, x86_amx %42) +void X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { + Value *Tile = Cast->getOperand(0); + // TODO: If it is cast intrinsic or phi node, we can propagate the + // shape information through def-use chain. + if (!isAMXIntrinsic(Tile)) + return; + auto *II = cast(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); + // Use the maximum column as stride. It must be the same with load + // stride. 
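  // Why 64 is safe as the stride here (a sketch based on the AMX limits, not
  // on anything computed in this function): a tile is at most 16 rows x 64
  // bytes, i.e. 1024 bytes, which is exactly the size of a <256 x i32>; with a
  // 64-byte stride, row I of the tile lands at byte I * 64 of the vector, the
  // same layout a 64-byte-stride tile load of that memory would read back.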
+ Value *Stride = Builder.getInt64(64); + Value *I8Ptr = + Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy()); + std::array Args = {Row, Col, I8Ptr, Stride, Tile}; + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); +} + +// %65 = load <256 x i32>, <256 x i32>* %p, align 64 +// %66 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %65) +// --> +// %66 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, +// i8* %p, i64 64) +void X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { + Value *Row = nullptr, *Col = nullptr; + Use &U = *(Cast->use_begin()); + unsigned OpNo = U.getOperandNo(); + auto *II = cast(U.getUser()); + // TODO: If it is cast intrinsic or phi node, we can propagate the + // shape information through def-use chain. + if (!isAMXIntrinsic(II)) + return; + std::tie(Row, Col) = getShape(II, OpNo); + IRBuilder<> Builder(LD); + // Use the maximun column as stride. + Value *Stride = Builder.getInt64(64); + Value *I8Ptr = + Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy()); + std::array Args = {Row, Col, I8Ptr, Stride}; + + Value *NewInst = + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args); + Cast->replaceAllUsesWith(NewInst); +} + +bool X86LowerAMXCast::combineLdSt(SmallVectorImpl &Casts) { + bool Change = false; + for (auto *Cast : Casts) { + auto *II = cast(Cast); + // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %42) + // store <256 x i32> %43, <256 x i32>* %p, align 64 + // --> + // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p, + // i64 64, x86_amx %42) + if (II->getIntrinsicID() == Intrinsic::x86_cast_tile_to_vector) { + SmallVector DeadStores; + for (User *U : Cast->users()) { + StoreInst *Store = dyn_cast(U); + if (!Store) + continue; + combineCastStore(cast(Cast), Store); + DeadStores.push_back(Store); + Change = true; + } + for (auto *Store : DeadStores) + Store->eraseFromParent(); + } else { // x86_cast_vector_to_tile + SmallVector DeadLoads; + auto *Load = dyn_cast(Cast->getOperand(0)); + if (!Load || !Load->hasOneUse()) + continue; + // %65 = load <256 x i32>, <256 x i32>* %p, align 64 + // %66 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %65) + // --> + // %66 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, + // i8* %p, i64 64) + combineLoadCast(cast(Cast), Load); + // Set the operand is null so that load instruction can be erased. + Cast->setOperand(0, nullptr); + Load->eraseFromParent(); + } + } + return Change; +} + bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) { bool Change = false; // Collect tile cast instruction. @@ -879,17 +1045,22 @@ bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) { Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector); Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile); + SmallVector LiveCasts; auto EraseInst = [&](SmallVectorImpl &Insts) { for (auto *Inst : Insts) { if (Inst->use_empty()) { Inst->eraseFromParent(); Change = true; + } else { + LiveCasts.push_back(Inst); } } }; EraseInst(Vec2TileInsts); EraseInst(Tile2VecInsts); + Change |= combineLdSt(LiveCasts); + EraseInst(LiveCasts); // Handle the A->B->A cast, and there is an intervening PHI node. 
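  // Shape of the pattern handled next (a sketch; the value names are made up):
  //   bb1:   %v = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t)
  //   ...
  //   join:  %p  = phi <256 x i32> [ %v, %bb1 ], [ %u, %bb2 ]
  //          %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %p)
  // optimizeAMXCastFromPhi tries to rewrite such phis to carry x86_amx values
  // directly, so that both casts become dead and can be erased.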
for (BasicBlock &BB : Func) { @@ -947,6 +1118,10 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { // i64 60) // call void @llvm.x86.tilestored64.internal(i16 15, i16 60, // i8* %addr3, i64 60, x86_amx %2) + if (AMXCast->use_empty()) { + AMXCast->eraseFromParent(); + return true; + } Use &U = *(AMXCast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = dyn_cast(U.getUser()); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 9044f10ec630..b107de692365 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -501,7 +501,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { for (const MachineOperand &MO : MI->operands()) if (auto MaybeMCOp = LowerMachineOperand(MI, MO)) - OutMI.addOperand(MaybeMCOp.getValue()); + OutMI.addOperand(*MaybeMCOp); // Handle a few special cases to eliminate operand modifiers. switch (OutMI.getOpcode()) { @@ -962,6 +962,12 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { // These are not truly commutable so hide them from the default case. break; + case X86::MASKMOVDQU: + case X86::VMASKMOVDQU: + if (AsmPrinter.getSubtarget().is64Bit()) + OutMI.setFlags(X86::IP_HAS_AD_SIZE); + break; + default: { // If the instruction is a commutable arithmetic instruction we might be // able to commute the operands to get a 2 byte VEX prefix. @@ -1311,7 +1317,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, E = FaultingMI.operands_end(); I != E; ++I) if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I)) - MI.addOperand(MaybeOperand.getValue()); + MI.addOperand(*MaybeOperand); OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); OutStreamer->emitInstruction(MI, getSubtargetInfo()); @@ -1347,11 +1353,12 @@ void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) { AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset); - std::string Name = AccessInfo.IsWrite ? "store" : "load"; - std::string Op = OrShadowOffset ? "or" : "add"; - std::string SymName = "__asan_check_" + Name + "_" + Op + "_" + - utostr(1ULL << AccessInfo.AccessSizeIndex) + "_" + - TM.getMCRegisterInfo()->getName(Reg.asMCReg()); + StringRef Name = AccessInfo.IsWrite ? "store" : "load"; + StringRef Op = OrShadowOffset ? 
"or" : "add"; + std::string SymName = ("__asan_check_" + Name + "_" + Op + "_" + + Twine(1ULL << AccessInfo.AccessSizeIndex) + "_" + + TM.getMCRegisterInfo()->getName(Reg.asMCReg())) + .str(); if (OrShadowOffset) report_fatal_error( "OrShadowOffset is not supported with optimized callbacks"); @@ -1375,7 +1382,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, MCI.setOpcode(Opcode); for (auto &MO : drop_begin(MI.operands(), 2)) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - MCI.addOperand(MaybeOperand.getValue()); + MCI.addOperand(*MaybeOperand); SmallString<256> Code; SmallVector Fixups; @@ -1751,7 +1758,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, Ret.setOpcode(OpCode); for (auto &MO : drop_begin(MI.operands())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - Ret.addOperand(MaybeOperand.getValue()); + Ret.addOperand(*MaybeOperand); OutStreamer->emitInstruction(Ret, getSubtargetInfo()); emitX86Nops(*OutStreamer, 10, Subtarget); recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2); @@ -1790,7 +1797,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, OutStreamer->AddComment("TAILCALL"); for (auto &MO : drop_begin(MI.operands())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - TC.addOperand(MaybeOperand.getValue()); + TC.addOperand(*MaybeOperand); OutStreamer->emitInstruction(TC, getSubtargetInfo()); } @@ -1985,34 +1992,34 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { // Otherwise, use the .seh_ directives for all other Windows platforms. switch (MI->getOpcode()) { case X86::SEH_PushReg: - OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm()); + OutStreamer->emitWinCFIPushReg(MI->getOperand(0).getImm()); break; case X86::SEH_SaveReg: - OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(), + OutStreamer->emitWinCFISaveReg(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_SaveXMM: - OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(), + OutStreamer->emitWinCFISaveXMM(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_StackAlloc: - OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + OutStreamer->emitWinCFIAllocStack(MI->getOperand(0).getImm()); break; case X86::SEH_SetFrame: - OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(), + OutStreamer->emitWinCFISetFrame(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_PushFrame: - OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + OutStreamer->emitWinCFIPushFrame(MI->getOperand(0).getImm()); break; case X86::SEH_EndPrologue: - OutStreamer->EmitWinCFIEndProlog(); + OutStreamer->emitWinCFIEndProlog(); break; default: diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index 05f846bfb219..2e88e01ce7fd 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -13,6 +13,13 @@ using namespace llvm; +MachineFunctionInfo *X86MachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const { + return DestMF.cloneInfo(*this); +} + void X86MachineFunctionInfo::anchor() { } void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 99d1a97380dd..99cc9f525b2c 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h 
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -119,7 +119,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { Optional SwiftAsyncContextFrameIdx; - ValueMap PreallocatedIds; + // Preallocated fields are only used during isel. + // FIXME: Can we find somewhere else to store these? + DenseMap PreallocatedIds; SmallVector PreallocatedStackSizes; SmallVector, 0> PreallocatedArgOffsets; @@ -132,6 +134,12 @@ public: X86MachineFunctionInfo() = default; explicit X86MachineFunctionInfo(MachineFunction &MF) {} + explicit X86MachineFunctionInfo(const X86MachineFunctionInfo &) = default; + + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap &Src2DstMBB) + const override; bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp index 425054cfdd92..aa6e8645e092 100644 --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index e92b1b002bb0..bb59cee8badb 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -37,21 +37,20 @@ STATISTIC(NumBBsPadded, "Number of basic blocks padded"); namespace { struct VisitedBBInfo { // HasReturn - Whether the BB contains a return instruction - bool HasReturn; + bool HasReturn = false; // Cycles - Number of cycles until return if HasReturn is true, otherwise // number of cycles until end of the BB - unsigned int Cycles; + unsigned int Cycles = 0; - VisitedBBInfo() : HasReturn(false), Cycles(0) {} + VisitedBBInfo() = default; VisitedBBInfo(bool HasReturn, unsigned int Cycles) : HasReturn(HasReturn), Cycles(Cycles) {} }; struct PadShortFunc : public MachineFunctionPass { static char ID; - PadShortFunc() : MachineFunctionPass(ID) - , Threshold(4) {} + PadShortFunc() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -82,7 +81,7 @@ namespace { MachineBasicBlock::iterator &MBBI, unsigned int NOOPsToAdd); - const unsigned int Threshold; + const unsigned int Threshold = 4; // ReturnBBs - Maps basic blocks that return to the minimum number of // cycles until the return, starting from the entry block. diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 4342ac089cae..7761f7323358 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -19,8 +19,10 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -220,16 +222,21 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { if (!cast(Op->getType())->getElementType()->isIntegerTy(32)) return false; - // Operand should be a select. - auto *SI = dyn_cast(Op); - if (!SI) - return false; - - // Select needs to implement absolute value. 
- Value *LHS, *RHS; - auto SPR = matchSelectPattern(SI, LHS, RHS); - if (SPR.Flavor != SPF_ABS) - return false; + Value *LHS; + if (match(Op, PatternMatch::m_Intrinsic())) { + LHS = Op->getOperand(0); + } else { + // Operand should be a select. + auto *SI = dyn_cast(Op); + if (!SI) + return false; + + Value *RHS; + // Select needs to implement absolute value. + auto SPR = matchSelectPattern(SI, LHS, RHS); + if (SPR.Flavor != SPF_ABS) + return false; + } // Need a subtract of two values. auto *Sub = dyn_cast(LHS); @@ -253,7 +260,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { if (!Op0 || !Op1) return false; - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Op); auto *OpTy = cast(Op->getType()); unsigned NumElts = OpTy->getNumElements(); @@ -271,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID); + Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. @@ -336,8 +343,8 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask); } - SI->replaceAllUsesWith(Ops[0]); - SI->eraseFromParent(); + Op->replaceAllUsesWith(Ops[0]); + Op->eraseFromParent(); return true; } diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp index d9c6d08ada73..cd0d448238a6 100644 --- a/llvm/lib/Target/X86/X86PreAMXConfig.cpp +++ b/llvm/lib/Target/X86/X86PreAMXConfig.cpp @@ -91,16 +91,17 @@ static bool brokenVolatile(Instruction *I) { namespace { class X86PreAMXConfig { + using PosAndShapesMap = MapVector>; + Function &F; public: X86PreAMXConfig(Function &Func) : F(Func) {} bool preTileConfig(); - bool addTileConfig(Instruction *ModelStart, SmallVector &Shapes); - bool findConfigShapes( - DenseMap> &PosAndShapes); + void addTileConfig(Instruction *ModelStart, SmallVector &Shapes); + bool findConfigShapes(PosAndShapesMap &PosAndShapes); bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector &Shapes); - bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos, + void preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder, SmallVector &Shapes); BasicBlock::iterator getShapesAndConfigPosEnd(BasicBlock::iterator Iter, @@ -149,10 +150,9 @@ public: // %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) // call void @llvm.x86.tilestored64.internal(... td) area // -------------------------------------------------------------------------- -bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos, +void X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder, SmallVector &Shapes) { - bool Write = false; - LLVMContext &Ctx = Pos->getParent()->getContext(); + LLVMContext &Ctx = Builder.getContext(); Type *I8Ty = Type::getInt8Ty(Ctx); Type *I16Ty = Type::getInt16Ty(Ctx); @@ -160,30 +160,27 @@ bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos, // other value in the future. 
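  // Reading aid for the offsets used below: they spell out the usual 64-byte
  // ldtilecfg block, i.e.
  //   byte  0          : palette id (always 1 here)
  //   bytes 16 + 2 * I : 16-bit column size in bytes for tmm<I>
  //   byte  48 + I     : 8-bit row count for tmm<I>
  // so, for example, tmm0's shape is written at bytes 16-17 (col) and 48 (row).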
Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0); Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1); - Value *PalettePos = - GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos); - new StoreInst(PaletteValue, PalettePos, Pos); + Value *PalettePos = Builder.CreateGEP(I8Ty, I8Ptr, PaletteOffset); + Builder.CreateStore(PaletteValue, PalettePos); for (int I = 0, E = Shapes.size() / 2; I < E; I++) { Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I); Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2); const std::string ShapeName = "amx.tmm." + itostr(I); - Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset, - ShapeName + ".shape.row", Pos); - Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos); - ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0), - ShapeName + ".shape.col", Pos); + Value *RowPos = Builder.CreateGEP(I8Ty, I8Ptr, RowOffset, + ShapeName + ".shape.row"); + Value *ColPos = Builder.CreateGEP(I8Ty, I8Ptr, ColOffset); + ColPos = Builder.CreateBitCast(ColPos, PointerType::get(I16Ty, 0), + ShapeName + ".shape.col"); Value *Row = Shapes[I * 2]; Value *Col = Shapes[I * 2 + 1]; - Row = new TruncInst(Row, I8Ty, "", Pos); - new StoreInst(Row, RowPos, Pos); - new StoreInst(Col, ColPos, Pos); - Write = true; + Row = Builder.CreateTrunc(Row, I8Ty); + Builder.CreateStore(Row, RowPos); + Builder.CreateStore(Col, ColPos); } - return Write; } -bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart, +void X86PreAMXConfig::addTileConfig(Instruction *ModelStart, SmallVector &Shapes) { Module *M = F.getParent(); IRBuilder<> Builder(ModelStart); @@ -198,17 +195,11 @@ bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart, Addr->setAlignment(Alignment); Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy()); - std::array Args = {I8Ptr}; - Instruction *Cfg = - Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args); - - Value *Val0 = Constant::getNullValue(V512Ty); - Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg); - assert(Init0 && "Not Zero initilizate the cfg mem!"); + Builder.CreateAlignedStore(Constant::getNullValue(V512Ty), Addr, Alignment); - preWriteTileCfg(I8Ptr, Cfg, Shapes); + preWriteTileCfg(I8Ptr, Builder, Shapes); - return Init0; + Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, {I8Ptr}); } // Todo: We may need to handle "more than one store" case in the future. @@ -315,8 +306,7 @@ X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter, // %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n) // call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n) // -------------------------------------------------------------------------- -bool X86PreAMXConfig::findConfigShapes( - DenseMap> &PosAndShapes) { +bool X86PreAMXConfig::findConfigShapes(PosAndShapesMap &PosAndShapes) { bool Find = false; for (BasicBlock &BB : F) { for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) { @@ -365,7 +355,7 @@ bool X86PreAMXConfig::findConfigShapes( // call void @llvm.x86.tilestored64.internal(... 
td) area // -------------------------------------------------------------------------- bool X86PreAMXConfig::preTileConfig() { - DenseMap> PosAndShapes; + PosAndShapesMap PosAndShapes; bool NeedCfg = findConfigShapes(PosAndShapes); if (!NeedCfg) return false; diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 5d21f8666ec6..479db8585ca0 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -40,10 +41,15 @@ using namespace llvm; #define DEBUG_TYPE "tile-pre-config" -#define REPORT_CONFIG_FAIL \ - report_fatal_error( \ - MF.getName() + \ - ": Failed to config tile register, please define the shape earlier"); + +static void emitErrorMsg(MachineFunction &MF) { + SmallString<32> Str; + Twine ErrorMsg = + MF.getName() + + ": Failed to config tile register, please define the shape earlier"; + LLVMContext &Context = MF.getMMI().getModule()->getContext(); + Context.emitError(ErrorMsg); +} namespace { @@ -302,12 +308,19 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { SmallVector WorkList; for (auto &I : ShapeBBs) { // TODO: We can hoist shapes across BBs here. - if (BBVisitedInfo[I.first].HasAMXRegLiveIn) - REPORT_CONFIG_FAIL + if (BBVisitedInfo[I.first].HasAMXRegLiveIn) { + // We are not able to config tile registers since the shape to config + // is not defined yet. Emit error message and continue. The function + // would not config tile registers. + emitErrorMsg(MF); + return false; + } if (BBVisitedInfo[I.first].FirstAMX && BBVisitedInfo[I.first].FirstAMX < I.second.back() && - !hoistShapesInBB(I.first, I.second)) - REPORT_CONFIG_FAIL + !hoistShapesInBB(I.first, I.second)) { + emitErrorMsg(MF); + return false; + } WorkList.push_back(I.first); } while (!WorkList.empty()) { @@ -356,7 +369,7 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { // multi insert. if (VisitedOrInserted.insert(I).second) { auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin(); - addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::LDTILECFG)), + addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::PLDTILECFGV)), SS); } } @@ -367,33 +380,27 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { MachineInstr *MI = &*MBB.begin(); if (ST.hasAVX512()) { Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); - BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm) - .addReg(Zmm, RegState::Undef) - .addReg(Zmm, RegState::Undef); + BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm); addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS) .addReg(Zmm); } else if (ST.hasAVX2()) { Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass); - BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm) - .addReg(Ymm, RegState::Undef) - .addReg(Ymm, RegState::Undef); + BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm); addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS) .addReg(Ymm); addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32) .addReg(Ymm); } else { assert(ST.hasSSE2() && "AMX should assume SSE2 enabled"); + unsigned StoreOpc = ST.hasAVX() ? 
X86::VMOVUPSmr : X86::MOVUPSmr; Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass); - BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm) - .addReg(Xmm, RegState::Undef) - .addReg(Xmm, RegState::Undef); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS) - .addReg(Xmm); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16) + BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS).addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 16) .addReg(Xmm); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32) + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 32) .addReg(Xmm); - addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48) + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 48) .addReg(Xmm); } // Fill in the palette first. diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp index 9c076d2d6769..c49fc458eab3 100644 --- a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp @@ -12,9 +12,9 @@ #include "X86RegisterBankInfo.h" #include "X86InstrInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL @@ -25,8 +25,7 @@ using namespace llvm; #define GET_TARGET_REGBANK_INFO_IMPL #include "X86GenRegisterBankInfo.def" -X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) - : X86GenRegisterBankInfo() { +X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) { // validate RegBank initialization. 
const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID); diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.h b/llvm/lib/Target/X86/X86RegisterBankInfo.h index d5afd2cae761..fca36a317b58 100644 --- a/llvm/lib/Target/X86/X86RegisterBankInfo.h +++ b/llvm/lib/Target/X86/X86RegisterBankInfo.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "X86GenRegisterBank.inc" diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 130cb61cdde2..f2658f70434b 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -26,6 +26,8 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TileShapeInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" @@ -618,6 +620,66 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool X86RegisterInfo::isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const { + const X86Subtarget &ST = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); + auto IsSubReg = [&](MCRegister RegA, MCRegister RegB) { + return TRI.isSuperOrSubRegisterEq(RegA, RegB); + }; + + if (!ST.is64Bit()) + return llvm::any_of( + SmallVector{X86::EAX, X86::ECX, X86::EDX}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); }) || + (ST.hasMMX() && X86::VR64RegClass.contains(Reg)); + + CallingConv::ID CC = MF.getFunction().getCallingConv(); + + if (CC == CallingConv::X86_64_SysV && IsSubReg(X86::RAX, Reg)) + return true; + + if (llvm::any_of( + SmallVector{X86::RDX, X86::RCX, X86::R8, X86::R9}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); })) + return true; + + if (CC != CallingConv::Win64 && + llvm::any_of(SmallVector{X86::RDI, X86::RSI}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); })) + return true; + + if (ST.hasSSE1() && + llvm::any_of(SmallVector{X86::XMM0, X86::XMM1, X86::XMM2, + X86::XMM3, X86::XMM4, X86::XMM5, + X86::XMM6, X86::XMM7}, + [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); })) + return true; + + return X86GenRegisterInfo::isArgumentRegister(MF, Reg); +} + +bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF, + MCRegister PhysReg) const { + const X86Subtarget &ST = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); + + // Stack pointer. + if (TRI.isSuperOrSubRegisterEq(X86::RSP, PhysReg)) + return true; + + // Don't use the frame pointer if it's being used. + const X86FrameLowering &TFI = *getFrameLowering(MF); + if (TFI.hasFP(MF) && TRI.isSuperOrSubRegisterEq(X86::RBP, PhysReg)) + return true; + + return X86GenRegisterInfo::isFixedRegister(MF, PhysReg); +} + +bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { + return RC->getID() == X86::TILERegClassID; +} + void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. 
This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 7fd10ddd1a15..6f4fb405d29f 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -115,6 +115,18 @@ public: /// register scavenger to determine what registers are free. BitVector getReservedRegs(const MachineFunction &MF) const override; + /// isArgumentReg - Returns true if Reg can be used as an argument to a + /// function. + bool isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const override; + + /// Return true if it is tile register class. + bool isTileRegisterClass(const TargetRegisterClass *RC) const; + + /// Returns true if PhysReg is a fixed register. + bool isFixedRegister(const MachineFunction &MF, + MCRegister PhysReg) const override; + void adjustStackMapLiveOutMask(uint32_t *Mask) const override; bool hasBasePointer(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 1b704bcb8e08..6dc51e37d3c2 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -537,6 +537,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)> {let Size = 32;} + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill @@ -599,7 +601,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; -def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>; +def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;} // Extended VR128 and VR256 for AVX-512 instructions def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], @@ -638,3 +640,14 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} + +//===----------------------------------------------------------------------===// +// Register categories. +// + +// The TILE and VK*PAIR registers may not be "fixed", but we don't want them +// anyway. 
+def FixedRegisters : RegisterCategory<[DEBUG_REG, CONTROL_REG, CCR, FPCCR, + DFCCR, TILE, VK1PAIR, VK2PAIR, VK4PAIR, + VK8PAIR, VK16PAIR]>; +def GeneralPurposeRegisters : RegisterCategory<[GR64, GR32, GR16, GR8]>; diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 8e317dc22bd6..e4b95cb0807f 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -814,12 +814,26 @@ def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> { def: InstRW<[BWWriteResGroup34], (instregex "CLD")>; def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[BWWriteResGroup35], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def BWWriteResGroup36 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[BWWriteResGroup36], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def BWWriteResGroup36b : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[BWWriteResGroup36b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> { let Latency = 3; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 1cd0b3379684..7b1a31d2a4df 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -1299,12 +1299,26 @@ def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { def: InstRW<[HWWriteResGroup58], (instregex "CLD")>; def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[HWWriteResGroup59], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def HWWriteResGroup60 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[HWWriteResGroup60], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def HWWriteResGroup60b : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[HWWriteResGroup60b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> { let Latency = 4; diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 9fd986e34181..b66db7e7e73a 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -923,12 +923,26 @@ def ICXWriteResGroup43 : SchedWriteRes<[ICXPort237,ICXPort0156]> { def: InstRW<[ICXWriteResGroup43], (instrs MFENCE)>; def ICXWriteResGroup44 : SchedWriteRes<[ICXPort06,ICXPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[ICXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[ICXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def ICXWriteResGroup44b : 
SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> { + let Latency = 5; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[ICXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def ICXWriteResGroup44c : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> { + let Latency = 6; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[ICXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def ICXWriteResGroup45 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237]> { let Latency = 3; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 7e619a3a8722..49858ca0a800 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -111,8 +111,17 @@ def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. def : WriteRes; -def : WriteRes { let Latency = 5; let NumMicroOps = 0; } + +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : X86WriteRes; // Arithmetic. defm : SBWriteResPair; @@ -678,13 +687,27 @@ def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> { } def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>; -def SBWriteResGroup23 : SchedWriteRes<[SBPort05]> { +def SBWriteResGroup23 : SchedWriteRes<[SBPort05,SBPort015]> { let Latency = 2; let NumMicroOps = 3; - let ResourceCycles = [3]; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup23], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def SBWriteResGroup24 : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> { + let Latency = 3; + let NumMicroOps = 8; + let ResourceCycles = [1,1,4,2]; +} +def: InstRW<[SBWriteResGroup24], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def SBWriteResGroup24b : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> { + let Latency = 4; + let NumMicroOps = 8; + let ResourceCycles = [1,1,4,2]; } -def: InstRW<[SBWriteResGroup23], (instregex "RCL(8|16|32|64)r1", - "RCR(8|16|32|64)r1")>; +def: InstRW<[SBWriteResGroup24b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> { let Latency = 7; @@ -727,8 +750,8 @@ def SBWriteResGroup76 : SchedWriteRes<[SBPort05]> { let NumMicroOps = 8; let ResourceCycles = [8]; } -def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)r(i|CL)", - "RCR(8|16|32|64)r(i|CL)")>; +def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)rCL", + "RCR(8|16|32|64)rCL")>; def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> { let Latency = 5; @@ -802,8 +825,7 @@ def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SBWriteResGroup48], (instrs MMX_MOVD64from64rm, - VBROADCASTSSrm)>; +def: InstRW<[SBWriteResGroup48], (instrs VBROADCASTSSrm)>; def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r", "(V?)MOV64toPQIrm", "(V?)MOVDDUPrm", diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 0a88bac5aa66..05364e3434e4 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -836,12 +836,26 @@ def 
SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> { def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>; def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[SKLWriteResGroup42], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def SKLWriteResGroup42b : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKLWriteResGroup42b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def SKLWriteResGroup42c : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKLWriteResGroup42c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { let Latency = 3; @@ -921,8 +935,7 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67? + "MOVZX(16|32|64)rm(8|16)")>; def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { let Latency = 5; @@ -979,7 +992,8 @@ def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm, VPBROADCASTDrm, VPBROADCASTQrm)>; def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm", - "(V?)MOVSLDUPrm")>; + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index b28a18f0dcd7..b682b51c298a 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -905,12 +905,26 @@ def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> { def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>; def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> { - let Latency = 3; + let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)", - "RCR(8|16|32|64)r(1|i)")>; +def: InstRW<[SKXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def SKXWriteResGroup44b : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def SKXWriteResGroup44c : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 8; + let ResourceCycles = [2,4,2]; +} +def: InstRW<[SKXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> { let Latency = 3; @@ -1041,8 +1055,7 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71? 
+ "MOVZX(16|32|64)rm(8|16)")>; def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 5; @@ -1145,11 +1158,10 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { } def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm, VPBROADCASTDrm, - VPBROADCASTQrm, - VMOVSHDUPrm, - VMOVSLDUPrm, - MOVSHDUPrm, - MOVSLDUPrm)>; + VPBROADCASTQrm)>; +def: InstRW<[SKXWriteResGroup71], (instregex "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 4b2fa87a25b5..1e9fcf6cc8cf 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -840,8 +840,8 @@ def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JAL let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; let NumMicroOps = 63; } -def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32, - VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>; +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; /////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 52605c031617..de4e7dd3cb90 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -377,10 +377,8 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : X86WriteResPairUnsupported; -// FIXME: The below is closer to correct, but caused some perf regressions. -//defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; +defm : SLMWriteResPair; +defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; defm : SLMWriteResPair; defm : SLMWriteResPair; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index fe0484afd227..aada3e0bd906 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -189,15 +189,6 @@ defm : ZnWriteResPair; defm : ZnWriteResPair; defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; -//defm : ZnWriteResPair; defm : X86WriteRes; defm : X86WriteRes; @@ -227,12 +218,10 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -//defm : X86WriteRes; -//defm : X86WriteRes; // Bit counts. -defm : ZnWriteResPair; -defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; defm : ZnWriteResPair; defm : ZnWriteResPair; defm : ZnWriteResPair; @@ -240,9 +229,8 @@ defm : ZnWriteResPair; // Treat misc copies as a move. 
def : InstRW<[WriteMove], (instrs COPY)>; -// BMI1 BEXTR/BLS, BMI2 BZHI +// BMI1 BEXTR, BMI2 BZHI defm : ZnWriteResPair; -//defm : ZnWriteResPair; defm : ZnWriteResPair; // IDIV @@ -271,13 +259,13 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; - defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -288,24 +276,24 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteResUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; @@ -346,8 +334,8 @@ defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -410,20 +398,23 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -448,7 +439,7 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -456,11 +447,6 @@ defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; -// Vector Shift Operations -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : X86WriteResPairUnsupported; - // Vector insert/extract operations. defm : ZnWriteResFpuPair; @@ -623,15 +609,14 @@ def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { def : SchedAlias; def : SchedAlias; // TODO: is this right? def : SchedAlias; // TODO: is this right? -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. 
-def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // m16. def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { let Latency = 8; } def : SchedAlias; - +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // r32. def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { let Latency = 3; @@ -639,14 +624,14 @@ def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { def : SchedAlias; def : SchedAlias; // TODO: is this right? def : SchedAlias; // TODO: is this right? -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // m32. def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { let Latency = 8; } def : SchedAlias; +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // r64. def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { @@ -656,8 +641,6 @@ def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { def : SchedAlias; def : SchedAlias; // TODO: is this right? def : SchedAlias; // TODO: is this right? -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. -def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // m64. def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { @@ -665,6 +648,8 @@ def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { let NumMicroOps = 2; } def : SchedAlias; +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. +def : SchedAlias; // TODO: this is definitely wrong but matches what the instregex did. // MULX // Numbers are based on the AMD SOG for Family 17h - Instruction Latencies. @@ -1101,12 +1086,11 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; // HADD, HSUB PS/PD // PHADD|PHSUB (S) W/D. -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // PCMPGTQ. def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>; @@ -1446,12 +1430,6 @@ def : InstRW<[ZnWriteSHA256RNDS2Ld], (instrs SHA256RNDS2rm)>; //-- Arithmetic instructions --// -// HADD, HSUB PS/PD -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; - // VDIVPS. // TODO - convert to ZnWriteResFpuPair // y,y,y. diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index 38908a987595..c47d235eab9b 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -195,7 +195,7 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : Zn2WriteResPair; +defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; @@ -219,8 +219,8 @@ defm : X86WriteRes; defm : X86WriteRes; // Bit counts. 
-defm : Zn2WriteResPair; -defm : Zn2WriteResPair; +defm : Zn2WriteResPair; +defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; defm : Zn2WriteResPair; @@ -230,7 +230,7 @@ def : InstRW<[WriteMove], (instrs COPY)>; // BMI1 BEXTR, BMI2 BZHI defm : Zn2WriteResPair; -defm : Zn2WriteResPair; +defm : Zn2WriteResPair; // IDIV defm : Zn2WriteResPair; @@ -247,23 +247,17 @@ def Zn2WriteIMulH : WriteRes{ let Latency = 3; let NumMicroOps = 0; } - def : WriteRes{ let Latency = !add(Zn2WriteIMulH.Latency, Znver2Model.LoadLatency); let NumMicroOps = Zn2WriteIMulH.NumMicroOps; } - // Floating point operations defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -271,29 +265,34 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteResUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; @@ -332,8 +331,8 @@ defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; @@ -394,20 +393,23 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; @@ -440,11 +442,6 @@ defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; -// Vector Shift Operations -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; -defm : X86WriteResPairUnsupported; - 
 // Vector insert/extract operations.
 defm : Zn2WriteResFpuPair;
@@ -486,12 +483,6 @@ defm : Zn2WriteResFpuPair;
 def Zn2WriteMicrocoded : SchedWriteRes<[]> {
   let Latency = 100;
 }
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
-defm : Zn2WriteResPair;
 def : SchedAlias;
 def : SchedAlias;
@@ -1109,6 +1100,14 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
 //-- Arithmetic instructions --//
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+defm : Zn2WriteResFpuPair;
+
 // PCMPGTQ.
 def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
 def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1479,6 +1478,7 @@ def : SchedAlias;
 // DPPS.
 // x,x,i / v,v,v,i.
+defm : Zn2WriteResPair;
 def : SchedAlias;
 // x,m,i / v,v,m,i.
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 5e59081c63b0..78a286ae5b28 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   ConstantSDNode *ConstantSize = dyn_cast(Size);
   const X86Subtarget &Subtarget =
@@ -67,40 +67,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
   if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
-    // Check to see if there is a specialized entry-point for memory zeroing.
-    ConstantSDNode *ValC = dyn_cast(Val);
-
-    if (const char *bzeroName =
-            (ValC && ValC->isZero())
-                ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
-                : nullptr) {
-      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
-      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
-      TargetLowering::ArgListTy Args;
-      TargetLowering::ArgListEntry Entry;
-      Entry.Node = Dst;
-      Entry.Ty = IntPtrTy;
-      Args.push_back(Entry);
-      Entry.Node = Size;
-      Args.push_back(Entry);
-
-      TargetLowering::CallLoweringInfo CLI(DAG);
-      CLI.setDebugLoc(dl)
-          .setChain(Chain)
-          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                        DAG.getExternalSymbol(bzeroName, IntPtr),
-                        std::move(Args))
-          .setDiscardResult();
-
-      std::pair CallResult = TLI.LowerCallTo(CLI);
-      return CallResult.second;
-    }
-
-    // Otherwise have the target-independent code call memset.
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
-  }
   uint64_t SizeVal = ConstantSize->getZExtValue();
   SDValue InFlag;
@@ -175,7 +143,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
                         DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                     DAG.getConstant(Offset, dl, AddrVT)),
                         Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-                        isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+                        isVolatile, AlwaysInline,
+                        /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
 }
 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
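The X86SelectionDAGInfo.cpp hunk above removes the custom bzero fast path: when the fill size is unknown, too large, or under-aligned, the hook no longer builds a call to the RTLIB::BZERO entry point itself but returns an empty SDValue and lets the generic SelectionDAG lowering choose memset or bzero. It also threads the new AlwaysInline flag into the recursive call that handles the leftover tail bytes. A minimal sketch of the bail-out condition, as a free-standing helper rather than the in-tree member function; the dyn_cast template argument does not appear in the text above and is assumed to be ConstantSDNode:

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Hypothetical helper mirroring the new early exit: true means the X86 hook
// declines and the target-independent lowering (which may still emit a
// library call) takes over.
static bool shouldDeferMemsetToGenericLowering(SDValue Size, Align Alignment,
                                               uint64_t MaxInlineSize) {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); // assumed template argument
  return Alignment < Align(4) || !ConstantSize ||
         ConstantSize->getZExtValue() > MaxInlineSize;
}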
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h index dac62973636c..19136ca4f6f5 100644 --- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -29,7 +29,7 @@ public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, - bool isVolatile, + bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index dba11e8b4000..3317db891cf0 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -181,17 +181,18 @@ private: void tracePredStateThroughBlocksAndHarden(MachineFunction &MF); unsigned saveEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc Loc); + MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc); void restoreEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc, Register Reg); void mergePredStateIntoSP(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - unsigned PredStateReg); + MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc, unsigned PredStateReg); unsigned extractPredStateFromSP(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc); + const DebugLoc &Loc); void hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO, @@ -203,7 +204,7 @@ private: bool canHardenRegister(Register Reg); unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc); + const DebugLoc &Loc); unsigned hardenPostLoad(MachineInstr &MI); void hardenReturnInstr(MachineInstr &MI); void tracePredStateThroughCall(MachineInstr &MI); @@ -356,8 +357,8 @@ static void canonicalizePHIOperands(MachineFunction &MF) { int OpIdx = DupIndices.pop_back_val(); // Remove both the block and value operand, again in reverse order to // preserve indices. - MI.RemoveOperand(OpIdx + 1); - MI.RemoveOperand(OpIdx); + MI.removeOperand(OpIdx + 1); + MI.removeOperand(OpIdx); } Preds.clear(); @@ -1500,7 +1501,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden( /// as the save so that no PHI nodes are inserted. unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc) { + const DebugLoc &Loc) { // FIXME: Hard coding this to a 32-bit register class seems weird, but matches // what instruction selection does. Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass); @@ -1517,8 +1518,8 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( /// This must be done within the same basic block as the save in order to /// reliably lower. 
void X86SpeculativeLoadHardeningPass::restoreEFLAGS( - MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - Register Reg) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc, Register Reg) { BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg); ++NumInstsInserted; } @@ -1528,8 +1529,8 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS( /// a way that won't form non-canonical pointers and also will be preserved /// across normal stack adjustments. void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( - MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - unsigned PredStateReg) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + const DebugLoc &Loc, unsigned PredStateReg) { Register TmpReg = MRI->createVirtualRegister(PS->RC); // FIXME: This hard codes a shift distance based on the number of bits needed // to stay canonical on 64-bit. We should compute this somehow and support @@ -1549,7 +1550,7 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( /// Extracts the predicate state stored in the high bits of the stack pointer. unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc) { + const DebugLoc &Loc) { Register PredStateReg = MRI->createVirtualRegister(PS->RC); Register TmpReg = MRI->createVirtualRegister(PS->RC); @@ -1907,7 +1908,7 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) { /// register class as `Reg`. unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, - DebugLoc Loc) { + const DebugLoc &Loc) { assert(canHardenRegister(Reg) && "Cannot harden this register!"); assert(Reg.isVirtual() && "Cannot harden a physical register!"); diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index a3d4d04b1e0d..0d091adc8e77 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -21,6 +21,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" @@ -247,7 +249,7 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, // the following check for Win32 should be removed. - if (In64BitMode || isTargetWin32()) + if (Is64Bit || isTargetWin32()) return false; return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } @@ -274,12 +276,12 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, // introduced with Intel's Nehalem/Silvermont and AMD's Family10h // micro-architectures respectively. 
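Besides the rename of MachineInstr::RemoveOperand to removeOperand, the X86SpeculativeLoadHardening hunks above switch every helper from taking DebugLoc by value to const DebugLoc &; DebugLoc wraps a tracking metadata reference, so passing it by value copies that handle on each call for no benefit. A free-standing sketch of the save-EFLAGS idiom those helpers implement, written with the new signature convention; it assumes the X86 backend's in-tree headers and is an illustration, not a drop-in replacement for the member function:

#include "X86InstrInfo.h"                      // in-tree X86 backend header
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Copy EFLAGS into a fresh 32-bit virtual register so the hardening sequence
// can clobber flags; restoreEFLAGS() later copies the value back.
static Register saveEFLAGSSketch(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &Loc, MachineRegisterInfo &MRI,
                                 const TargetInstrInfo &TII) {
  Register Reg = MRI.createVirtualRegister(&X86::GR32RegClass);
  BuildMI(MBB, InsertPt, Loc, TII.get(X86::COPY), Reg).addReg(X86::EFLAGS);
  return Reg;
}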
if (hasSSE42() || hasSSE4A()) - IsUAMem16Slow = false; + IsUnalignedMem16Slow = false; LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); - if (In64BitMode && !HasX86_64) + if (Is64Bit && !HasX86_64) report_fatal_error("64-bit code requested on a subtarget that doesn't " "support it!"); @@ -289,7 +291,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, if (StackAlignOverride) stackAlignment = *StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || - isTargetNaCl() || In64BitMode) + isTargetNaCl() || Is64Bit) stackAlignment = Align(16); // Consume the vector width attribute or apply any target specific limit. @@ -357,7 +359,7 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { } bool X86Subtarget::enableEarlyIfConversion() const { - return hasCMov() && X86EarlyIfConv; + return canUseCMOV() && X86EarlyIfConv; } void X86Subtarget::getPostRAMutations( diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 5d773f0c57df..09a8b1f1aafb 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -50,24 +50,14 @@ enum class Style { } // end namespace PICStyles class X86Subtarget final : public X86GenSubtargetInfo { - // NOTE: Do not add anything new to this list. Coarse, CPU name based flags - // are not a good idea. We should be migrating away from these. - enum X86ProcFamilyEnum { - Others, - IntelAtom - }; - enum X86SSEEnum { - NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512 }; enum X863DNowEnum { NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; - /// X86 processor family: Intel Atom, and others - X86ProcFamilyEnum X86ProcFamily = Others; - /// Which PIC style to use PICStyles::Style PICStyle; @@ -79,412 +69,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel = NoThreeDNow; - /// True if the processor supports X87 instructions. - bool HasX87 = false; - - /// True if the processor supports CMPXCHG8B. - bool HasCmpxchg8b = false; - - /// True if this processor has NOPL instruction - /// (generally pentium pro+). - bool HasNOPL = false; - - /// True if this processor has conditional move instructions - /// (generally pentium pro+). - bool HasCMov = false; - - /// True if the processor supports X86-64 instructions. - bool HasX86_64 = false; - - /// True if the processor supports POPCNT. - bool HasPOPCNT = false; - - /// True if the processor supports SSE4A instructions. 
- bool HasSSE4A = false; - - /// Target has AES instructions - bool HasAES = false; - bool HasVAES = false; - - /// Target has FXSAVE/FXRESTOR instructions - bool HasFXSR = false; - - /// Target has XSAVE instructions - bool HasXSAVE = false; - - /// Target has XSAVEOPT instructions - bool HasXSAVEOPT = false; - - /// Target has XSAVEC instructions - bool HasXSAVEC = false; - - /// Target has XSAVES instructions - bool HasXSAVES = false; - - /// Target has carry-less multiplication - bool HasPCLMUL = false; - bool HasVPCLMULQDQ = false; - - /// Target has Galois Field Arithmetic instructions - bool HasGFNI = false; - - /// Target has 3-operand fused multiply-add - bool HasFMA = false; - - /// Target has 4-operand fused multiply-add - bool HasFMA4 = false; - - /// Target has XOP instructions - bool HasXOP = false; - - /// Target has TBM instructions. - bool HasTBM = false; - - /// Target has LWP instructions - bool HasLWP = false; - - /// True if the processor has the MOVBE instruction. - bool HasMOVBE = false; - - /// True if the processor has the RDRAND instruction. - bool HasRDRAND = false; - - /// Processor has 16-bit floating point conversion instructions. - bool HasF16C = false; - - /// Processor has FS/GS base insturctions. - bool HasFSGSBase = false; - - /// Processor has LZCNT instruction. - bool HasLZCNT = false; - - /// Processor has BMI1 instructions. - bool HasBMI = false; - - /// Processor has BMI2 instructions. - bool HasBMI2 = false; - - /// Processor has VBMI instructions. - bool HasVBMI = false; - - /// Processor has VBMI2 instructions. - bool HasVBMI2 = false; - - /// Processor has Integer Fused Multiply Add - bool HasIFMA = false; - - /// Processor has RTM instructions. - bool HasRTM = false; - - /// Processor has ADX instructions. - bool HasADX = false; - - /// Processor has SHA instructions. - bool HasSHA = false; - - /// Processor has PRFCHW instructions. - bool HasPRFCHW = false; - - /// Processor has RDSEED instructions. - bool HasRDSEED = false; - - /// Processor has LAHF/SAHF instructions in 64-bit mode. - bool HasLAHFSAHF64 = false; - - /// Processor has MONITORX/MWAITX instructions. - bool HasMWAITX = false; - - /// Processor has Cache Line Zero instruction - bool HasCLZERO = false; - - /// Processor has Cache Line Demote instruction - bool HasCLDEMOTE = false; - - /// Processor has MOVDIRI instruction (direct store integer). - bool HasMOVDIRI = false; - - /// Processor has MOVDIR64B instruction (direct store 64 bytes). - bool HasMOVDIR64B = false; - - /// Processor has ptwrite instruction. - bool HasPTWRITE = false; - - /// Processor has Prefetch with intent to Write instruction - bool HasPREFETCHWT1 = false; - - /// True if SHLD instructions are slow. - bool IsSHLDSlow = false; - - /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and - // PMULUDQ. - bool IsPMULLDSlow = false; - - /// True if the PMADDWD instruction is slow compared to PMULLD. - bool IsPMADDWDSlow = false; - - /// True if unaligned memory accesses of 16-bytes are slow. - bool IsUAMem16Slow = false; - - /// True if unaligned memory accesses of 32-bytes are slow. - bool IsUAMem32Slow = false; - - /// True if SSE operations can have unaligned memory operands. - /// This may require setting a configuration bit in the processor. - bool HasSSEUnalignedMem = false; - - /// True if this processor has the CMPXCHG16B instruction; - /// this is true for most x86-64 chips, but not the first AMD chips. 
- bool HasCmpxchg16b = false; - - /// True if the LEA instruction should be used for adjusting - /// the stack pointer. This is an optimization for Intel Atom processors. - bool UseLeaForSP = false; - - /// True if POPCNT instruction has a false dependency on the destination register. - bool HasPOPCNTFalseDeps = false; - - /// True if LZCNT/TZCNT instructions have a false dependency on the destination register. - bool HasLZCNTFalseDeps = false; - - /// True if its preferable to combine to a single cross-lane shuffle - /// using a variable mask over multiple fixed shuffles. - bool HasFastVariableCrossLaneShuffle = false; - - /// True if its preferable to combine to a single per-lane shuffle - /// using a variable mask over multiple fixed shuffles. - bool HasFastVariablePerLaneShuffle = false; - - /// True if vzeroupper instructions should be inserted after code that uses - /// ymm or zmm registers. - bool InsertVZEROUPPER = false; - - /// True if there is no performance penalty for writing NOPs with up to - /// 7 bytes. - bool HasFast7ByteNOP = false; - - /// True if there is no performance penalty for writing NOPs with up to - /// 11 bytes. - bool HasFast11ByteNOP = false; - - /// True if there is no performance penalty for writing NOPs with up to - /// 15 bytes. - bool HasFast15ByteNOP = false; - - /// True if gather is reasonably fast. This is true for Skylake client and - /// all AVX-512 CPUs. - bool HasFastGather = false; - - /// True if hardware SQRTSS instruction is at least as fast (latency) as - /// RSQRTSS followed by a Newton-Raphson iteration. - bool HasFastScalarFSQRT = false; - - /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast - /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. - bool HasFastVectorFSQRT = false; - - /// True if 8-bit divisions are significantly faster than - /// 32-bit divisions and should be used when possible. - bool HasSlowDivide32 = false; - - /// True if 32-bit divides are significantly faster than - /// 64-bit divisions and should be used when possible. - bool HasSlowDivide64 = false; - - /// True if LZCNT instruction is fast. - bool HasFastLZCNT = false; - - /// True if SHLD based rotate is fast. - bool HasFastSHLDRotate = false; - - /// True if the processor supports macrofusion. - bool HasMacroFusion = false; - - /// True if the processor supports branch fusion. - bool HasBranchFusion = false; - - /// True if the processor has enhanced REP MOVSB/STOSB. - bool HasERMSB = false; - - /// True if the processor has fast short REP MOV. - bool HasFSRM = false; - - /// True if the short functions should be padded to prevent - /// a stall when returning too early. - bool PadShortFunctions = false; - - /// True if two memory operand instructions should use a temporary register - /// instead. - bool SlowTwoMemOps = false; - - /// True if the LEA instruction inputs have to be ready at address generation - /// (AG) time. 
- bool LEAUsesAG = false; - - /// True if the LEA instruction with certain arguments is slow - bool SlowLEA = false; - - /// True if the LEA instruction has all three source operands: base, index, - /// and offset or if the LEA instruction uses base and index registers where - /// the base is EBP, RBP,or R13 - bool Slow3OpsLEA = false; - - /// True if INC and DEC instructions are slow when writing to flags - bool SlowIncDec = false; - - /// Processor has AVX-512 PreFetch Instructions - bool HasPFI = false; - - /// Processor has AVX-512 Exponential and Reciprocal Instructions - bool HasERI = false; - - /// Processor has AVX-512 Conflict Detection Instructions - bool HasCDI = false; - - /// Processor has AVX-512 population count Instructions - bool HasVPOPCNTDQ = false; - - /// Processor has AVX-512 Doubleword and Quadword instructions - bool HasDQI = false; - - /// Processor has AVX-512 Byte and Word instructions - bool HasBWI = false; - - /// Processor has AVX-512 Vector Length eXtenstions - bool HasVLX = false; - - /// Processor has AVX-512 16 bit floating-point extenstions - bool HasFP16 = false; - - /// Processor has PKU extenstions - bool HasPKU = false; - - /// Processor has AVX-512 Vector Neural Network Instructions - bool HasVNNI = false; - - /// Processor has AVX Vector Neural Network Instructions - bool HasAVXVNNI = false; - - /// Processor has AVX-512 bfloat16 floating-point extensions - bool HasBF16 = false; - - /// Processor supports ENQCMD instructions - bool HasENQCMD = false; - - /// Processor has AVX-512 Bit Algorithms instructions - bool HasBITALG = false; - - /// Processor has AVX-512 vp2intersect instructions - bool HasVP2INTERSECT = false; - - /// Processor supports CET SHSTK - Control-Flow Enforcement Technology - /// using Shadow Stack - bool HasSHSTK = false; - - /// Processor supports Invalidate Process-Context Identifier - bool HasINVPCID = false; - - /// Processor has Software Guard Extensions - bool HasSGX = false; - - /// Processor supports Flush Cache Line instruction - bool HasCLFLUSHOPT = false; - - /// Processor supports Cache Line Write Back instruction - bool HasCLWB = false; - - /// Processor supports Write Back No Invalidate instruction - bool HasWBNOINVD = false; - - /// Processor support RDPID instruction - bool HasRDPID = false; - - /// Processor supports WaitPKG instructions - bool HasWAITPKG = false; - - /// Processor supports PCONFIG instruction - bool HasPCONFIG = false; - - /// Processor support key locker instructions - bool HasKL = false; - - /// Processor support key locker wide instructions - bool HasWIDEKL = false; - - /// Processor supports HRESET instruction - bool HasHRESET = false; - - /// Processor supports SERIALIZE instruction - bool HasSERIALIZE = false; - - /// Processor supports TSXLDTRK instruction - bool HasTSXLDTRK = false; - - /// Processor has AMX support - bool HasAMXTILE = false; - bool HasAMXBF16 = false; - bool HasAMXINT8 = false; - - /// Processor supports User Level Interrupt instructions - bool HasUINTR = false; - - /// Enable SSE4.2 CRC32 instruction (Used when SSE4.2 is supported but - /// function is GPR only) - bool HasCRC32 = false; - - /// Processor has a single uop BEXTR implementation. - bool HasFastBEXTR = false; - - /// Try harder to combine to horizontal vector ops if they are fast. - bool HasFastHorizontalOps = false; - - /// Prefer a left/right scalar logical shifts pair over a shift+and pair. 
- bool HasFastScalarShiftMasks = false; - - /// Prefer a left/right vector logical shifts pair over a shift+and pair. - bool HasFastVectorShiftMasks = false; - - /// Prefer a movbe over a single-use load + bswap / single-use bswap + store. - bool HasFastMOVBE = false; - - /// Use a retpoline thunk rather than indirect calls to block speculative - /// execution. - bool UseRetpolineIndirectCalls = false; - - /// Use a retpoline thunk or remove any indirect branch to block speculative - /// execution. - bool UseRetpolineIndirectBranches = false; - - /// Deprecated flag, query `UseRetpolineIndirectCalls` and - /// `UseRetpolineIndirectBranches` instead. - bool DeprecatedUseRetpoline = false; - - /// When using a retpoline thunk, call an externally provided thunk rather - /// than emitting one inside the compiler. - bool UseRetpolineExternalThunk = false; - - /// Prevent generation of indirect call/branch instructions from memory, - /// and force all indirect call/branch instructions from a register to be - /// preceded by an LFENCE. Also decompose RET instructions into a - /// POP+LFENCE+JMP sequence. - bool UseLVIControlFlowIntegrity = false; - - /// Enable Speculative Execution Side Effect Suppression - bool UseSpeculativeExecutionSideEffectSuppression = false; - - /// Insert LFENCE instructions to prevent data speculatively injected into - /// loads from being used maliciously. - bool UseLVILoadHardening = false; - - /// Use an instruction sequence for taking the address of a global that allows - /// a memory tag in the upper address bits. - bool AllowTaggedGlobals = false; - - /// Use software floating point for code generation. - bool UseSoftFloat = false; - - /// Use alias analysis during code generation. - bool UseAA = false; - +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "X86GenSubtargetInfo.inc" /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. Align stackAlignment = Align(4); @@ -496,21 +83,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { // FIXME: this is a known good value for Yonah. How about others? unsigned MaxInlineSizeThreshold = 128; - /// Indicates target prefers 128 bit instructions. - bool Prefer128Bit = false; - - /// Indicates target prefers 256 bit instructions. - bool Prefer256Bit = false; - - /// Indicates target prefers AVX512 mask registers. - bool PreferMaskRegisters = false; - - /// Use Silvermont specific arithmetic costs. - bool UseSLMArithCosts = false; - - /// Use Goldmont specific floating point div/sqrt costs. - bool UseGLMDivSqrtCosts = false; - /// What processor and OS we're targeting. Triple TargetTriple; @@ -520,7 +92,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { std::unique_ptr RegBankInfo; std::unique_ptr InstSelector; -private: /// Override the stack alignment. MaybeAlign StackAlignOverride; @@ -534,15 +105,6 @@ private: /// Required vector width from function attribute. unsigned RequiredVectorWidth; - /// True if compiling for 64-bit, false for 16-bit or 32-bit. - bool In64BitMode = false; - - /// True if compiling for 32-bit, false for 16-bit or 64-bit. - bool In32BitMode = false; - - /// True if compiling for 16-bit, false for 32-bit or 64-bit. - bool In16BitMode = false; - X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which // X86TargetLowering needs. 
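The wall of hand-maintained feature booleans deleted above does not disappear; it moves into TableGen. X86GenSubtargetInfo.inc, generated from X86.td, now carries one GET_SUBTARGETINFO_MACRO record per subtarget feature, and X86Subtarget.h stamps the list out twice: once here to declare the members, and once in the public section of a later hunk to declare the getters. A sketch of the mechanism; the HasAES record is an assumed example of what the generated file contains, and the generated file is assumed to #undef the macro after each expansion so the header can define it twice:

// Illustrative record in the generated X86GenSubtargetInfo.inc:
//   GET_SUBTARGETINFO_MACRO(HasAES, false, hasAES)

// Member-declaring expansion (this hunk):
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
  bool ATTRIBUTE = DEFAULT;
#include "X86GenSubtargetInfo.inc"
// ...yields: bool HasAES = false;

// Getter-declaring expansion (the later 'public:' hunk):
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
  bool GETTER() const { return ATTRIBUTE; }
#include "X86GenSubtargetInfo.inc"
// ...yields: bool hasAES() const { return HasAES; }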
@@ -608,38 +170,32 @@ private: void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); public: - /// Is this x86_64? (disregarding specific ABI / programming model) - bool is64Bit() const { - return In64BitMode; - } - bool is32Bit() const { - return In32BitMode; - } - - bool is16Bit() const { - return In16BitMode; - } +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const { return ATTRIBUTE; } +#include "X86GenSubtargetInfo.inc" /// Is this x86_64 with the ILP32 programming model (x32 ABI)? bool isTarget64BitILP32() const { - return In64BitMode && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); + return Is64Bit && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); } /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? bool isTarget64BitLP64() const { - return In64BitMode && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); + return Is64Bit && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); } PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } - bool hasX87() const { return HasX87; } - bool hasCmpxchg8b() const { return HasCmpxchg8b; } - bool hasNOPL() const { return HasNOPL; } + bool canUseCMPXCHG8B() const { return hasCX8(); } + bool canUseCMPXCHG16B() const { + // CX16 is just the CPUID bit, instruction requires 64-bit mode too. + return hasCX16() && is64Bit(); + } // SSE codegen depends on cmovs, and all SSE1+ processors support them. // All 64-bit processors support cmov. - bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); } + bool canUseCMOV() const { return hasCMOV() || hasSSE1() || is64Bit(); } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } bool hasSSE3() const { return X86SSELevel >= SSE3; } @@ -648,146 +204,26 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } - bool hasAVX512() const { return X86SSELevel >= AVX512F; } + bool hasAVX512() const { return X86SSELevel >= AVX512; } bool hasInt256() const { return hasAVX2(); } - bool hasSSE4A() const { return HasSSE4A; } bool hasMMX() const { return X863DNowLevel >= MMX; } - bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } - bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } - bool hasPOPCNT() const { return HasPOPCNT; } - bool hasAES() const { return HasAES; } - bool hasVAES() const { return HasVAES; } - bool hasFXSR() const { return HasFXSR; } - bool hasXSAVE() const { return HasXSAVE; } - bool hasXSAVEOPT() const { return HasXSAVEOPT; } - bool hasXSAVEC() const { return HasXSAVEC; } - bool hasXSAVES() const { return HasXSAVES; } - bool hasPCLMUL() const { return HasPCLMUL; } - bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; } - bool hasGFNI() const { return HasGFNI; } - // Prefer FMA4 to FMA - its better for commutation/memory folding and - // has equal or better performance on all supported targets. 
- bool hasFMA() const { return HasFMA; } - bool hasFMA4() const { return HasFMA4; } + bool hasThreeDNow() const { return X863DNowLevel >= ThreeDNow; } + bool hasThreeDNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } - bool hasXOP() const { return HasXOP; } - bool hasTBM() const { return HasTBM; } - bool hasLWP() const { return HasLWP; } - bool hasMOVBE() const { return HasMOVBE; } - bool hasRDRAND() const { return HasRDRAND; } - bool hasF16C() const { return HasF16C; } - bool hasFSGSBase() const { return HasFSGSBase; } - bool hasLZCNT() const { return HasLZCNT; } - bool hasBMI() const { return HasBMI; } - bool hasBMI2() const { return HasBMI2; } - bool hasVBMI() const { return HasVBMI; } - bool hasVBMI2() const { return HasVBMI2; } - bool hasIFMA() const { return HasIFMA; } - bool hasRTM() const { return HasRTM; } - bool hasADX() const { return HasADX; } - bool hasSHA() const { return HasSHA; } - bool hasPRFCHW() const { return HasPRFCHW; } - bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } bool hasPrefetchW() const { // The PREFETCHW instruction was added with 3DNow but later CPUs gave it // its own CPUID bit as part of deprecating 3DNow. Intel eventually added // it and KNL has another that prefetches to L2 cache. We assume the // L1 version exists if the L2 version does. - return has3DNow() || hasPRFCHW() || hasPREFETCHWT1(); + return hasThreeDNow() || hasPRFCHW() || hasPREFETCHWT1(); } bool hasSSEPrefetch() const { // We implicitly enable these when we have a write prefix supporting cache // level OR if we have prfchw, but don't already have a read prefetch from // 3dnow. - return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); - } - bool hasRDSEED() const { return HasRDSEED; } - bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } - bool hasMWAITX() const { return HasMWAITX; } - bool hasCLZERO() const { return HasCLZERO; } - bool hasCLDEMOTE() const { return HasCLDEMOTE; } - bool hasMOVDIRI() const { return HasMOVDIRI; } - bool hasMOVDIR64B() const { return HasMOVDIR64B; } - bool hasPTWRITE() const { return HasPTWRITE; } - bool isSHLDSlow() const { return IsSHLDSlow; } - bool isPMULLDSlow() const { return IsPMULLDSlow; } - bool isPMADDWDSlow() const { return IsPMADDWDSlow; } - bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } - bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } - bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } - bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } - bool useLeaForSP() const { return UseLeaForSP; } - bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } - bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } - bool hasFastVariableCrossLaneShuffle() const { - return HasFastVariableCrossLaneShuffle; - } - bool hasFastVariablePerLaneShuffle() const { - return HasFastVariablePerLaneShuffle; + return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHWT1(); } - bool insertVZEROUPPER() const { return InsertVZEROUPPER; } - bool hasFastGather() const { return HasFastGather; } - bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } - bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } - bool hasFastLZCNT() const { return HasFastLZCNT; } - bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } - bool hasFastBEXTR() const { return HasFastBEXTR; } - bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } - bool hasFastScalarShiftMasks() const { return 
HasFastScalarShiftMasks; } - bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; } - bool hasFastMOVBE() const { return HasFastMOVBE; } - bool hasMacroFusion() const { return HasMacroFusion; } - bool hasBranchFusion() const { return HasBranchFusion; } - bool hasERMSB() const { return HasERMSB; } - bool hasFSRM() const { return HasFSRM; } - bool hasSlowDivide32() const { return HasSlowDivide32; } - bool hasSlowDivide64() const { return HasSlowDivide64; } - bool padShortFunctions() const { return PadShortFunctions; } - bool slowTwoMemOps() const { return SlowTwoMemOps; } - bool LEAusesAG() const { return LEAUsesAG; } - bool slowLEA() const { return SlowLEA; } - bool slow3OpsLEA() const { return Slow3OpsLEA; } - bool slowIncDec() const { return SlowIncDec; } - bool hasCDI() const { return HasCDI; } - bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } - bool hasPFI() const { return HasPFI; } - bool hasERI() const { return HasERI; } - bool hasDQI() const { return HasDQI; } - bool hasBWI() const { return HasBWI; } - bool hasVLX() const { return HasVLX; } - bool hasFP16() const { return HasFP16; } - bool hasPKU() const { return HasPKU; } - bool hasVNNI() const { return HasVNNI; } - bool hasBF16() const { return HasBF16; } - bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } - bool hasBITALG() const { return HasBITALG; } - bool hasSHSTK() const { return HasSHSTK; } - bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } - bool hasCLWB() const { return HasCLWB; } - bool hasWBNOINVD() const { return HasWBNOINVD; } - bool hasRDPID() const { return HasRDPID; } - bool hasWAITPKG() const { return HasWAITPKG; } - bool hasPCONFIG() const { return HasPCONFIG; } - bool hasSGX() const { return HasSGX; } - bool hasINVPCID() const { return HasINVPCID; } - bool hasENQCMD() const { return HasENQCMD; } - bool hasKL() const { return HasKL; } - bool hasWIDEKL() const { return HasWIDEKL; } - bool hasHRESET() const { return HasHRESET; } - bool hasSERIALIZE() const { return HasSERIALIZE; } - bool hasTSXLDTRK() const { return HasTSXLDTRK; } - bool hasUINTR() const { return HasUINTR; } - bool hasCRC32() const { return HasCRC32; } - bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } - bool useRetpolineIndirectBranches() const { - return UseRetpolineIndirectBranches; - } - bool hasAVXVNNI() const { return HasAVXVNNI; } - bool hasAMXTILE() const { return HasAMXTILE; } - bool hasAMXBF16() const { return HasAMXBF16; } - bool hasAMXINT8() const { return HasAMXINT8; } - bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } - + bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); } // These are generic getters that OR together all of the thunk types // supported by the subtarget. Therefore useIndirectThunk*() will return true // if any respective thunk feature is enabled. 
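What stays hand-written in the header are the predicates that layer a semantic rule on top of the raw, now-generated feature bits; several were renamed from has* to canUse* in this import to make that distinction visible. Collected from the hunks above, as member functions of X86Subtarget and reproduced here only to show the pattern:

// Inside X86Subtarget, as shown in the preceding hunks:
bool canUseCMPXCHG16B() const { return hasCX16() && is64Bit(); } // CPUID bit alone is not enough
bool canUseCMOV() const { return hasCMOV() || hasSSE1() || is64Bit(); } // SSE1+ and 64-bit CPUs imply CMOV
bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); }   // only 64-bit mode needs the bit

Callers follow suit: the X86Subtarget.cpp hunk earlier now gates early if-conversion on canUseCMOV() rather than on the raw CMOV feature flag.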
@@ -798,16 +234,6 @@ public: return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); } - bool preferMaskRegisters() const { return PreferMaskRegisters; } - bool useSLMArithCosts() const { return UseSLMArithCosts; } - bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } - bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } - bool allowTaggedGlobals() const { return AllowTaggedGlobals; } - bool useLVILoadHardening() const { return UseLVILoadHardening; } - bool useSpeculativeExecutionSideEffectSuppression() const { - return UseSpeculativeExecutionSideEffectSuppression; - } - unsigned getPreferVectorWidth() const { return PreferVectorWidth; } unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } @@ -834,11 +260,6 @@ public: bool isXRaySupported() const override { return is64Bit(); } - /// TODO: to be removed later and replaced with suitable properties - bool isAtom() const { return X86ProcFamily == IntelAtom; } - bool useSoftFloat() const { return UseSoftFloat; } - bool useAA() const override { return UseAA; } - /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for /// no-sse2). There isn't any reason to disable it if the target processor /// supports it. @@ -850,7 +271,7 @@ public: bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } - bool isTargetPS4() const { return TargetTriple.isPS4CPU(); } + bool isTargetPS() const { return TargetTriple.isPS(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } @@ -890,9 +311,9 @@ public: bool isOSWindows() const { return TargetTriple.isOSWindows(); } - bool isTargetWin64() const { return In64BitMode && isOSWindows(); } + bool isTargetWin64() const { return Is64Bit && isOSWindows(); } - bool isTargetWin32() const { return !In64BitMode && isOSWindows(); } + bool isTargetWin32() const { return !Is64Bit && isOSWindows(); } bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; } bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; } @@ -990,8 +411,6 @@ public: AntiDepBreakMode getAntiDepBreakMode() const override { return TargetSubtargetInfo::ANTIDEP_CRITICAL; } - - bool enableAdvancedRASplitCost() const override { return false; } }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index e3d0128dd73d..4249788e3540 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -27,13 +27,16 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDomainFix.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" @@ -56,6 +59,11 @@ static cl::opt EnableMachineCombinerPass("x86-machine-combiner", 
cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt + EnableTileRAPass("x86-tile-ra", + cl::desc("Enable the tile register allocation pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine X(getTheX86_32Target()); @@ -65,6 +73,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86LowerAMXIntrinsicsLegacyPassPass(PR); initializeX86LowerAMXTypeLegacyPassPass(PR); initializeX86PreAMXConfigPassPass(PR); + initializeX86PreTileConfigPass(PR); initializeGlobalISel(PR); initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); @@ -75,6 +84,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); initializeX86TileConfigPass(PR); + initializeX86FastPreTileConfigPass(PR); initializeX86FastTileConfigPass(PR); initializeX86LowerTileCopyPass(PR); initializeX86ExpandPseudoPass(PR); @@ -154,7 +164,7 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, bool JIT, Optional RM) { bool is64Bit = TT.getArch() == Triple::x86_64; - if (!RM.hasValue()) { + if (!RM) { // JIT codegen should use static relocations by default, since it's // typically executed in process and not relocatable. if (JIT) @@ -218,9 +228,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) { - // On PS4, the "return address" of a 'noreturn' call must still be within + // On PS4/PS5, the "return address" of a 'noreturn' call must still be within // the calling function, and TrapUnreachable is an easy way to get that. 
- if (TT.isPS4() || TT.isOSBinFormatMachO()) { + if (TT.isPS() || TT.isOSBinFormatMachO()) { this->Options.TrapUnreachable = true; this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO(); } @@ -333,7 +343,7 @@ bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, //===----------------------------------------------------------------------===// TargetTransformInfo -X86TargetMachine::getTargetTransformInfo(const Function &F) { +X86TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(X86TTIImpl(this, F)); } @@ -382,7 +392,7 @@ public: void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; - bool addPreRewrite() override; + bool addRegAssignAndRewriteOptimized() override; std::unique_ptr getCSEConfig() const override; }; @@ -417,9 +427,6 @@ void X86PassConfig::addIRPasses() { addPass(createX86LowerAMXIntrinsicsPass()); addPass(createX86LowerAMXTypePass()); - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(createX86PreAMXConfigPass()); - TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOpt::None) { @@ -441,6 +448,9 @@ void X86PassConfig::addIRPasses() { addPass(createCFGuardCheckPass()); } } + + if (TM->Options.JMCInstrument) + addPass(createJMCInstrumenterPass()); } bool X86PassConfig::addInstSelector() { @@ -505,9 +515,10 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86FlagsCopyLoweringPass()); addPass(createX86DynAllocaExpander()); - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(createX86PreTileConfigPass()); - } + else + addPass(createX86FastPreTileConfigPass()); } void X86PassConfig::addMachineSSAOptimization() { @@ -607,11 +618,21 @@ bool X86PassConfig::addPostFastRegAllocRewrite() { return true; } -bool X86PassConfig::addPreRewrite() { - addPass(createX86TileConfigPass()); - return true; -} - std::unique_ptr X86PassConfig::getCSEConfig() const { return getStandardCSEConfigForOpt(TM->getOptLevel()); } + +static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast(TRI).isTileRegisterClass(&RC); +} + +bool X86PassConfig::addRegAssignAndRewriteOptimized() { + // Don't support tile RA when RA is specified by command line "-regalloc". + if (!isCustomizedRegAlloc() && EnableTileRAPass) { + // Allocate tile register first. + addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters)); + addPass(createX86TileConfigPass()); + } + return TargetPassConfig::addRegAssignAndRewriteOptimized(); +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 69d7e48b8977..70df8da77641 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -44,7 +44,7 @@ public: // attributes of each function. const X86Subtarget *getSubtargetImpl() const = delete; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; // Set up the pass pipeline. 
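The addRegAssignAndRewriteOptimized() override added above replaces the old addPreRewrite() hook and is where the AMX tile plumbing comes together: a greedy register allocator restricted by a filter callback to the tile register class runs first, X86TileConfig then configures the physical tiles that were chosen, and only afterwards does normal allocation of every other register class proceed. A condensed sketch of that shape; the static_cast target does not appear in the text above and is assumed to be X86RegisterInfo, otherwise the code follows the hunk:

// Filter for the first allocation round: assign only AMX tile vregs.
static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI,
                                      const TargetRegisterClass &RC) {
  return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC); // assumed cast type
}

bool X86PassConfig::addRegAssignAndRewriteOptimized() {
  // Tile RA is skipped when the user forces an allocator via -regalloc.
  if (!isCustomizedRegAlloc() && EnableTileRAPass) {
    addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters)); // tiles only
    addPass(createX86TileConfigPass()); // configure the selected tiles
  }
  return TargetPassConfig::addRegAssignAndRewriteOptimized(); // all remaining classes
}

At -O0 the same split shows up earlier in the pipeline: addPreRegAlloc() now schedules the new X86FastPreTileConfig pass instead of the optimized X86PreTileConfig pass.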
TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 5b95c10332dc..b36f8a3d06d0 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1085,7 +1085,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef Mask, int Index, - VectorType *SubTp) { + VectorType *SubTp, + ArrayRef Args) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair LT = TLI->getTypeLegalizationCost(DL, BaseTp); @@ -1223,6 +1224,63 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), LegalVT.getVectorNumElements()); + if (!Mask.empty() && NumOfDests.isValid()) { + // Try to perform better estimation of the permutation. + // 1. Split the source/destination vectors into real registers. + // 2. Do the mask analysis to identify which real registers are + // permuted. If more than 1 source registers are used for the + // destination register building, the cost for this destination register + // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one + // source register is used, build mask and calculate the cost as a cost + // of PermuteSingleSrc. + // Also, for the single register permute we try to identify if the + // destination register is just a copy of the source register or the + // copy of the previous destination register (the cost is + // TTI::TCC_Basic). If the source register is just reused, the cost for + // this operation is 0. + unsigned E = *NumOfDests.getValue(); + unsigned NormalizedVF = + LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); + unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); + unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); + SmallVector NormalizedMask(NormalizedVF, UndefMaskElem); + copy(Mask, NormalizedMask.begin()); + unsigned PrevSrcReg = 0; + ArrayRef PrevRegMask; + InstructionCost Cost = 0; + processShuffleMasks( + NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, + [this, SingleOpTy, &PrevSrcReg, &PrevRegMask, + &Cost](ArrayRef RegMask, unsigned SrcReg, unsigned DestReg) { + if (!ShuffleVectorInst::isIdentityMask(RegMask)) { + // Check if the previous register can be just copied to the next + // one. + if (PrevRegMask.empty() || PrevSrcReg != SrcReg || + PrevRegMask != RegMask) + Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, + RegMask, 0, nullptr); + else + // Just a copy of previous destination register. + Cost += TTI::TCC_Basic; + return; + } + if (SrcReg != DestReg && + any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { + // Just a copy of the source register. 
+ Cost += TTI::TCC_Basic; + } + PrevSrcReg = SrcReg; + PrevRegMask = RegMask; + }, + [this, SingleOpTy, &Cost](ArrayRef RegMask, + unsigned /*Unused*/, + unsigned /*Unused*/) { + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, + 0, nullptr); + }); + return Cost; + } + InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, None, 0, nullptr); @@ -1545,9 +1603,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute }; - if (ST->hasSSE2()) + static const CostTblEntry SSE3BroadcastLoadTbl[] = { + {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup + }; + + if (ST->hasSSE2()) { + bool IsLoad = + llvm::any_of(Args, [](const auto &V) { return isa(V); }); + if (ST->hasSSE3() && IsLoad) + if (const auto *Entry = + CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { + assert(isLegalBroadcastLoad(BaseTp->getElementType(), + LT.second.getVectorElementCount()) && + "Table entry missing from isLegalBroadcastLoad()"); + return LT.first * Entry->Cost; + } + if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; + } static const CostTblEntry SSE1ShuffleTbl[] = { { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps @@ -2444,6 +2518,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, std::pair LTDest = TLI->getTypeLegalizationCost(DL, Dst); + // If we're truncating to the same legalized type - just assume its free. + if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) + return TTI::TCC_Free; + if (ST->useAVX512Regs()) { if (ST->hasBWI()) if (const auto *Entry = ConvertCostTableLookup( @@ -2545,7 +2623,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - unsigned ExtraCost = 0; + InstructionCost ExtraCost = 0; if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { // Some vector comparison predicates cost extra instructions. 
// TODO: Should we invert this and assume worst case cmp costs @@ -2619,15 +2697,29 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16f32, 1 }, { ISD::SELECT, MVT::v8i64, 1 }, + { ISD::SELECT, MVT::v4i64, 1 }, + { ISD::SELECT, MVT::v2i64, 1 }, { ISD::SELECT, MVT::v16i32, 1 }, + { ISD::SELECT, MVT::v8i32, 1 }, + { ISD::SELECT, MVT::v4i32, 1 }, { ISD::SELECT, MVT::v8f64, 1 }, + { ISD::SELECT, MVT::v4f64, 1 }, + { ISD::SELECT, MVT::v2f64, 1 }, + { ISD::SELECT, MVT::f64, 1 }, { ISD::SELECT, MVT::v16f32, 1 }, + { ISD::SELECT, MVT::v8f32 , 1 }, + { ISD::SELECT, MVT::v4f32, 1 }, + { ISD::SELECT, MVT::f32 , 1 }, { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 - { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 - { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 + { ISD::SELECT, MVT::v32i16, 2 }, + { ISD::SELECT, MVT::v16i16, 1 }, + { ISD::SELECT, MVT::v8i16, 1 }, + { ISD::SELECT, MVT::v64i8, 2 }, + { ISD::SELECT, MVT::v32i8, 1 }, + { ISD::SELECT, MVT::v16i8, 1 }, }; static const CostTblEntry AVX2CostTbl[] = { @@ -2636,10 +2728,12 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i16, 1 }, { ISD::SETCC, MVT::v32i8, 1 }, - { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb - { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb - { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb - { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb + { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd + { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps + { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb + { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb + { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb + { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb }; static const CostTblEntry AVX1CostTbl[] = { @@ -2651,49 +2745,54 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i16, 4 }, { ISD::SETCC, MVT::v32i8, 4 }, - { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd - { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps - { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd - { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps + { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd + { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps + { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd + { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps }; static const CostTblEntry SSE42CostTbl[] = { - { ISD::SETCC, MVT::v2f64, 1 }, - { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, }; static const CostTblEntry SSE41CostTbl[] = { - { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd - { ISD::SELECT, MVT::v4f32, 1 }, // blendvps - { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb - { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb - { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb - { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + + { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd + { ISD::SELECT, MVT::f64, 2 }, // blendvpd + { ISD::SELECT, MVT::v4f32, 2 }, // blendvps + { ISD::SELECT, MVT::f32 , 2 }, // blendvps + { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb + { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb + { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb + { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb }; static const CostTblEntry SSE2CostTbl[] = { { ISD::SETCC, MVT::v2f64, 2 }, { ISD::SETCC, MVT::f64, 1 }, - { ISD::SETCC, MVT::v2i64, 8 }, + { ISD::SETCC, 
MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion { ISD::SETCC, MVT::v4i32, 1 }, { ISD::SETCC, MVT::v8i16, 1 }, { ISD::SETCC, MVT::v16i8, 1 }, - { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd - { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por - { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por - { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por - { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por + { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por + { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por + { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por }; static const CostTblEntry SSE1CostTbl[] = { { ISD::SETCC, MVT::v4f32, 2 }, { ISD::SETCC, MVT::f32, 1 }, - { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps + { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps + { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps }; if (ST->useSLMArithCosts()) @@ -3555,7 +3654,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, assert(Val->isVectorTy() && "This must be a vector type"); Type *ScalarType = Val->getScalarType(); - int RegisterFileMoveCost = 0; + InstructionCost RegisterFileMoveCost = 0; // Non-immediate extraction/insertion can be handled as a sequence of // aliased loads+stores via the stack. @@ -3589,6 +3688,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (Index != -1U && (Opcode == Instruction::ExtractElement || Opcode == Instruction::InsertElement)) { + // Extraction of vXi1 elements are now efficiently handled by MOVMSK. + if (Opcode == Instruction::ExtractElement && + ScalarType->getScalarSizeInBits() == 1 && + cast(Val)->getNumElements() > 1) + return 1; + // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Val); @@ -3597,15 +3702,16 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return 0; // The type may be split. Normalize the index to the new type. + unsigned SizeInBits = LT.second.getSizeInBits(); unsigned NumElts = LT.second.getVectorNumElements(); unsigned SubNumElts = NumElts; Index = Index % NumElts; // For >128-bit vectors, we need to extract higher 128-bit subvectors. // For inserts, we also need to insert the subvector back. - if (LT.second.getSizeInBits() > 128) { - assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector"); - unsigned NumSubVecs = LT.second.getSizeInBits() / 128; + if (SizeInBits > 128) { + assert((SizeInBits % 128) == 0 && "Illegal vector"); + unsigned NumSubVecs = SizeInBits / 128; SubNumElts = NumElts / NumSubVecs; if (SubNumElts <= Index) { RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); @@ -3673,20 +3779,25 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { + assert(DemandedElts.getBitWidth() == + cast(Ty)->getNumElements() && + "Vector size mismatch"); + + std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); + MVT MScalarTy = LT.second.getScalarType(); + unsigned SizeInBits = LT.second.getSizeInBits(); + InstructionCost Cost = 0; // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. 
if (Insert) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); - MVT MScalarTy = LT.second.getScalarType(); - if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || (MScalarTy.isInteger() && ST->hasSSE41()) || (MScalarTy == MVT::f32 && ST->hasSSE41())) { // For types we can insert directly, insertion into 128-bit sub vectors is // cheap, followed by a cheap chain of concatenations. - if (LT.second.getSizeInBits() <= 128) { + if (SizeInBits <= 128) { Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); } else { @@ -3704,9 +3815,9 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. const int CostValue = *LT.first.getValue(); assert(CostValue >= 0 && "Negative cost!"); - unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue; + unsigned Num128Lanes = SizeInBits / 128 * CostValue; unsigned NumElts = LT.second.getVectorNumElements() * CostValue; - APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); + APInt WidenedDemandedElts = DemandedElts.zext(NumElts); unsigned Scale = NumElts / Num128Lanes; // We iterate each 128-lane, and check if we need a // extracti128/inserti128 for this 128-lane. @@ -3747,10 +3858,59 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, } } - // TODO: Use default extraction for now, but we should investigate extending this - // to handle repeated subvector extraction. - if (Extract) + if (Extract) { + // vXi1 can be efficiently extracted with MOVMSK. + // TODO: AVX512 predicate mask handling. + // NOTE: This doesn't work well for roundtrip scalarization. + if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { + unsigned NumElts = cast(Ty)->getNumElements(); + unsigned MaxElts = ST->hasAVX2() ? 32 : 16; + unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; + return MOVMSKCost; + } + + if (LT.second.isVector()) { + int CostValue = *LT.first.getValue(); + assert(CostValue >= 0 && "Negative cost!"); + + unsigned NumElts = LT.second.getVectorNumElements() * CostValue; + assert(NumElts >= DemandedElts.getBitWidth() && + "Vector has been legalized to smaller element count"); + + // If we're extracting elements from a 128-bit subvector lane, we only need + // to extract each lane once, not for every element. + if (SizeInBits > 128) { + assert((SizeInBits % 128) == 0 && "Illegal vector"); + unsigned NumLegal128Lanes = SizeInBits / 128; + unsigned Num128Lanes = NumLegal128Lanes * CostValue; + APInt WidenedDemandedElts = DemandedElts.zext(NumElts); + unsigned Scale = NumElts / Num128Lanes; + + // Add cost for each demanded 128-bit subvector extraction. + // Luckily this is a lot easier than for insertion. + APInt DemandedUpper128Lanes = + APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes); + auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale); + for (unsigned I = 0; I != Num128Lanes; ++I) + if (DemandedUpper128Lanes[I]) + Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, + I * Scale, Ty128); + + // Add all the demanded element extractions together, but adjust the + // index to use the equivalent of the bottom 128 bit lane. + for (unsigned I = 0; I != NumElts; ++I) + if (WidenedDemandedElts[I]) { + unsigned Idx = I % Scale; + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx); + } + + return Cost; + } + } + + // Fallback to default extraction. 
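The extraction side of getScalarizationOverhead() above adds a fast path for mask vectors: on targets without AVX-512, all lanes of a vXi1 value can be pulled out with MOVMSK-style instructions instead of per-element extracts, one instruction per 16 lanes, or 32 with AVX2. A small free-standing restatement of that arithmetic, for illustration only:

#include <cassert>

// Number of MOVMSKs needed to extract every lane of a vNxi1 vector,
// mirroring the (NumElts + MaxElts - 1) / MaxElts rule in the hunk above.
static unsigned movmskExtractCost(unsigned NumElts, bool HasAVX2) {
  assert(NumElts > 1 && "the fast path only applies to real vectors");
  unsigned MaxElts = HasAVX2 ? 32 : 16;
  return (NumElts + MaxElts - 1) / MaxElts;
}

// e.g. a v16i1 extract costs movmskExtractCost(16, false) == 1 instruction,
// while v64i1 on an AVX2 target costs movmskExtractCost(64, true) == 2.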
     Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+  }
 
   return Cost;
 }
@@ -3855,8 +4015,7 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
   // if all elements that will form a single Dst vector aren't demanded,
   // then we won't need to do that shuffle, so adjust the cost accordingly.
   APInt DemandedDstVectors = APIntOps::ScaleBitMask(
-      DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
-      NumDstVectors);
+      DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
   unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
 
   InstructionCost SingleShuffleCost =
@@ -5029,8 +5188,8 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
 
-bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
-                               TargetTransformInfo::LSRCost &C2) {
+bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                               const TargetTransformInfo::LSRCost &C2) {
   // X86 specific here are "instruction number 1st priority".
   return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                   C1.NumIVMuls, C1.NumBaseAdds,
@@ -5110,6 +5269,14 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
   return true;
 }
 
+bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
+                                      ElementCount NumElements) const {
+  // movddup
+  return ST->hasSSE3() && !NumElements.isScalable() &&
+         NumElements.getFixedValue() == 2 &&
+         ElementTy == Type::getDoubleTy(ElementTy->getContext());
+}
+
 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
   if (!isa<VectorType>(DataTy))
     return false;
@@ -5174,6 +5341,39 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
   return IntWidth == 32 || IntWidth == 64;
 }
 
+bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
+                                 unsigned Opcode1,
+                                 const SmallBitVector &OpcodeMask) const {
+  // ADDSUBPS  4xf32 SSE3
+  // VADDSUBPS 4xf32 AVX
+  // VADDSUBPS 8xf32 AVX2
+  // ADDSUBPD  2xf64 SSE3
+  // VADDSUBPD 2xf64 AVX
+  // VADDSUBPD 4xf64 AVX2
+
+  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+  assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
+  if (!isPowerOf2_32(NumElements))
+    return false;
+  // Check the opcode pattern. We apply the mask on the opcode arguments and
+  // then check if it is what we expect.
+  for (int Lane : seq<int>(0, NumElements)) {
+    unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
+    // We expect FSub for even lanes and FAdd for odd lanes.
+    if (Lane % 2 == 0 && Opc != Instruction::FSub)
+      return false;
+    if (Lane % 2 == 1 && Opc != Instruction::FAdd)
+      return false;
+  }
+  // Now check that the pattern is supported by the target ISA.
+  Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
+  if (ElemTy->isFloatTy())
+    return ST->hasSSE3() && NumElements % 4 == 0;
+  if (ElemTy->isDoubleTy())
+    return ST->hasSSE3() && NumElements % 2 == 0;
+  return false;
+}
+
 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
   // AVX2 doesn't support scatter
   if (!ST->hasAVX512())
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 69715072426f..bd3c3fb1bb2f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -38,12 +38,12 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   const FeatureBitset InlineFeatureIgnoreList = {
       // This indicates the CPU is 64 bit capable not that we are in 64-bit
      // mode.
-      X86::Feature64Bit,
+      X86::FeatureX86_64,
 
       // These features don't have any intrinsics or ABI effect.
       X86::FeatureNOPL,
-      X86::FeatureCMPXCHG16B,
-      X86::FeatureLAHFSAHF,
+      X86::FeatureCX16,
+      X86::FeatureLAHFSAHF64,
 
       // Some older targets can be setup to fold unaligned loads.
       X86::FeatureSSEUnalignedMem,
@@ -68,6 +68,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningMacroFusion,
       X86::TuningPadShortFunctions,
       X86::TuningPOPCNTFalseDeps,
+      X86::TuningMULCFalseDeps,
+      X86::TuningPERMFalseDeps,
+      X86::TuningRANGEFalseDeps,
+      X86::TuningGETMANTFalseDeps,
+      X86::TuningMULLQFalseDeps,
       X86::TuningSlow3OpsLEA,
       X86::TuningSlowDivide32,
       X86::TuningSlowDivide64,
@@ -131,7 +136,8 @@ public:
       const Instruction *CxtI = nullptr);
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask, int Index,
-                                 VectorType *SubTp);
+                                 VectorType *SubTp,
+                                 ArrayRef<const Value *> Args = None);
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
@@ -219,13 +225,14 @@ public:
   InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                       const APInt &Imm, Type *Ty,
                                       TTI::TargetCostKind CostKind);
-  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
-                     TargetTransformInfo::LSRCost &C2);
+  bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+                     const TargetTransformInfo::LSRCost &C2);
   bool canMacroFuseCmp();
   bool isLegalMaskedLoad(Type *DataType, Align Alignment);
   bool isLegalMaskedStore(Type *DataType, Align Alignment);
   bool isLegalNTLoad(Type *DataType, Align Alignment);
   bool isLegalNTStore(Type *DataType, Align Alignment);
+  bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const;
   bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
   bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
     return forceScalarizeMaskedGather(VTy, Alignment);
@@ -234,6 +241,8 @@ public:
   bool isLegalMaskedScatter(Type *DataType, Align Alignment);
   bool isLegalMaskedExpandLoad(Type *DataType);
   bool isLegalMaskedCompressStore(Type *DataType);
+  bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+                       const SmallBitVector &OpcodeMask) const;
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 8114a0b2d423..5cada924e006 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -36,7 +36,7 @@ using namespace llvm;
 
-#define DEBUG_TYPE "tile-config"
+#define DEBUG_TYPE "tileconfig"
 
 namespace {
@@ -70,11 +70,11 @@ struct X86TileConfig : public MachineFunctionPass {
 char X86TileConfig::ID = 0;
 
-INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
+INITIALIZE_PASS_BEGIN(X86TileConfig, DEBUG_TYPE, "Tile Register Configure",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
-                    false, false)
+INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
+                    false)
 
 bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
@@ -90,7 +90,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   int SS = INT_MAX;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() == X86::LDTILECFG) {
+      if (MI.getOpcode() == X86::PLDTILECFGV) {
        SS = MI.getOperand(0).getIndex();
         break;
       }
@@ -98,6 +98,9 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
     if (SS != INT_MAX)
       break;
   }
+  // Didn't find PLDTILECFGV, just return false;
+  if (SS == INT_MAX)
+    return false;
 
   // Try to find a point to insert MIs for constant shapes.
   // Here we are leveraging the palette id inserted in PreRA pass.
@@ -120,6 +123,8 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
       continue;
     if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID)
      continue;
+    if (VRM.getPhys(VirtReg) == VirtRegMap::NO_PHYS_REG)
+      continue;
     unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0;
     if (!Phys2Virt[Index])
       Phys2Virt[Index] = VirtReg;
--
cgit v1.2.3
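
A note on the vXi1 scalarization change in X86TTIImpl::getScalarizationOverhead above: when AVX512
mask registers are not available, the whole i1 vector can be moved to a GPR with (V)PMOVMSKB, so the
modelled extraction cost is a ceiling division of the element count by the widest available mask
move (32 lanes with AVX2, 16 otherwise). A minimal free-standing sketch of that arithmetic (plain
C++, not an LLVM API; the helper name is illustrative):

unsigned movmskExtractCost(unsigned NumElts, bool HasAVX2) {
  // One (V)PMOVMSKB covers up to MaxElts mask bits per transfer to a GPR.
  unsigned MaxElts = HasAVX2 ? 32 : 16;
  // Ceiling division: each started group of MaxElts lanes costs one MOVMSK.
  return (NumElts + MaxElts - 1) / MaxElts;
}

For example, a v64i1 mask costs (64 + 31) / 32 = 2 on an AVX2 target and (64 + 15) / 16 = 4 with
only SSE2.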
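
The new isLegalAltInstr hook accepts an alternating fadd/fsub pattern only when even lanes subtract
and odd lanes add, which is the lane order implemented by (V)ADDSUBPS/PD. A free-standing sketch of
that lane check, under the assumption that std::vector<bool> stands in for SmallBitVector (the enum
and function name are illustrative, not LLVM API):

#include <vector>

enum class Op { FAdd, FSub };

// OpcodeMask selects Opcode1 for set lanes and Opcode0 for clear lanes,
// mirroring how the hook interprets its SmallBitVector argument.
bool isAddSubLanePattern(Op Opcode0, Op Opcode1,
                         const std::vector<bool> &OpcodeMask) {
  for (size_t Lane = 0; Lane != OpcodeMask.size(); ++Lane) {
    Op Opc = OpcodeMask[Lane] ? Opcode1 : Opcode0;
    if (Lane % 2 == 0 && Opc != Op::FSub) // even lanes must subtract
      return false;
    if (Lane % 2 == 1 && Opc != Op::FAdd) // odd lanes must add
      return false;
  }
  return true;
}

For a v4f32 with Opcode0 = FSub, Opcode1 = FAdd and mask 0101 this returns true, which the real hook
then maps to a single ADDSUBPS given SSE3.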