Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp  513
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86Operand.h  36
-rw-r--r--  llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp  77
-rw-r--r--  llvm/lib/Target/X86/MCA/X86CustomBehaviour.h  2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp  8
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp  105
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp  22
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h  3
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp  165
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h  54
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp  4
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp  134
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h  1
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp  91
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h  26
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp  16
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp  21
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp  1
-rw-r--r--  llvm/lib/Target/X86/X86.h  4
-rw-r--r--  llvm/lib/Target/X86/X86.td  279
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.cpp  96
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.h  5
-rw-r--r--  llvm/lib/Target/X86/X86AvoidTrailingCall.cpp  7
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.cpp  2
-rw-r--r--  llvm/lib/Target/X86/X86CmovConversion.cpp  27
-rw-r--r--  llvm/lib/Target/X86/X86DiscriminateMemOps.cpp  3
-rw-r--r--  llvm/lib/Target/X86/X86DomainReassignment.cpp  14
-rw-r--r--  llvm/lib/Target/X86/X86ExpandPseudo.cpp  11
-rw-r--r--  llvm/lib/Target/X86/X86FastISel.cpp  133
-rw-r--r--  llvm/lib/Target/X86/X86FastPreTileConfig.cpp  709
-rw-r--r--  llvm/lib/Target/X86/X86FastTileConfig.cpp  293
-rw-r--r--  llvm/lib/Target/X86/X86FixupLEAs.cpp  3
-rw-r--r--  llvm/lib/Target/X86/X86FloatingPoint.cpp  26
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.cpp  136
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.h  7
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp  282
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  3225
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h  58
-rw-r--r--  llvm/lib/Target/X86/X86IndirectThunks.cpp  1
-rw-r--r--  llvm/lib/Target/X86/X86InsertPrefetch.cpp  1
-rw-r--r--  llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp  49
-rw-r--r--  llvm/lib/Target/X86/X86InstrAMX.td  18
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td  131
-rw-r--r--  llvm/lib/Target/X86/X86InstrArithmetic.td  8
-rw-r--r--  llvm/lib/Target/X86/X86InstrCMovSetCC.td  8
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td  85
-rw-r--r--  llvm/lib/Target/X86/X86InstrControl.td  4
-rw-r--r--  llvm/lib/Target/X86/X86InstrFPStack.td  22
-rw-r--r--  llvm/lib/Target/X86/X86InstrFoldTables.cpp  4
-rw-r--r--  llvm/lib/Target/X86/X86InstrFormats.td  6
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td  1
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp  851
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h  18
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.td  111
-rw-r--r--  llvm/lib/Target/X86/X86InstrMMX.td  4
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td  68
-rw-r--r--  llvm/lib/Target/X86/X86InstrSystem.td  16
-rw-r--r--  llvm/lib/Target/X86/X86InstrTSX.td  2
-rw-r--r--  llvm/lib/Target/X86/X86InstrVecCompiler.td  6
-rw-r--r--  llvm/lib/Target/X86/X86InstrXOP.td  4
-rw-r--r--  llvm/lib/Target/X86/X86InstructionSelector.cpp  16
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h  12
-rw-r--r--  llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp  3
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp  1
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXType.cpp  181
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp  41
-rw-r--r--  llvm/lib/Target/X86/X86MachineFunctionInfo.cpp  7
-rw-r--r--  llvm/lib/Target/X86/X86MachineFunctionInfo.h  10
-rw-r--r--  llvm/lib/Target/X86/X86MacroFusion.cpp  1
-rw-r--r--  llvm/lib/Target/X86/X86PadShortFunction.cpp  11
-rw-r--r--  llvm/lib/Target/X86/X86PartialReduction.cpp  35
-rw-r--r--  llvm/lib/Target/X86/X86PreAMXConfig.cpp  56
-rw-r--r--  llvm/lib/Target/X86/X86PreTileConfig.cpp  53
-rw-r--r--  llvm/lib/Target/X86/X86RegisterBankInfo.cpp  7
-rw-r--r--  llvm/lib/Target/X86/X86RegisterBankInfo.h  2
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.cpp  62
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.h  12
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.td  15
-rw-r--r--  llvm/lib/Target/X86/X86SchedBroadwell.td  20
-rw-r--r--  llvm/lib/Target/X86/X86SchedHaswell.td  20
-rw-r--r--  llvm/lib/Target/X86/X86SchedIceLake.td  20
-rw-r--r--  llvm/lib/Target/X86/X86SchedSandyBridge.td  40
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeClient.td  26
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeServer.td  32
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleBtVer2.td  4
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleSLM.td  6
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver1.td  106
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver2.td  86
-rw-r--r--  llvm/lib/Target/X86/X86SelectionDAGInfo.cpp  39
-rw-r--r--  llvm/lib/Target/X86/X86SelectionDAGInfo.h  2
-rw-r--r--  llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp  31
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.cpp  12
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h  629
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp  51
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.h  2
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp  290
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h  21
-rw-r--r--  llvm/lib/Target/X86/X86TileConfig.cpp  15
98 files changed, 6078 insertions, 3916 deletions
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e9ecff3bf514..871b23f80efe 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,6 +9,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCExpr.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86TargetStreamer.h"
#include "TargetInfo/X86TargetInfo.h"
#include "X86AsmParserCommon.h"
@@ -124,12 +125,12 @@ private:
bool matchingInlineAsm, unsigned VariantID = 0) {
// In Code16GCC mode, match as 32-bit.
if (Code16GCC)
- SwitchMode(X86::Mode32Bit);
+ SwitchMode(X86::Is32Bit);
unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo,
MissingFeatures, matchingInlineAsm,
VariantID);
if (Code16GCC)
- SwitchMode(X86::Mode16Bit);
+ SwitchMode(X86::Is16Bit);
return rv;
}
@@ -422,16 +423,18 @@ private:
};
class IntelExprStateMachine {
- IntelExprState State, PrevState;
- unsigned BaseReg, IndexReg, TmpReg, Scale;
- int64_t Imm;
- const MCExpr *Sym;
+ IntelExprState State = IES_INIT, PrevState = IES_ERROR;
+ unsigned BaseReg = 0, IndexReg = 0, TmpReg = 0, Scale = 0;
+ int64_t Imm = 0;
+ const MCExpr *Sym = nullptr;
StringRef SymName;
InfixCalculator IC;
InlineAsmIdentifierInfo Info;
- short BracCount;
- bool MemExpr;
- bool OffsetOperator;
+ short BracCount = 0;
+ bool MemExpr = false;
+ bool OffsetOperator = false;
+ bool AttachToOperandIdx = false;
+ bool IsPIC = false;
SMLoc OffsetOperatorLoc;
AsmTypeInfo CurType;
@@ -446,10 +449,7 @@ private:
}
public:
- IntelExprStateMachine()
- : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
- TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0),
- MemExpr(false), OffsetOperator(false) {}
+ IntelExprStateMachine() = default;
void addImm(int64_t imm) { Imm += imm; }
short getBracCount() const { return BracCount; }
@@ -469,9 +469,29 @@ private:
bool isValidEndState() const {
return State == IES_RBRAC || State == IES_INTEGER;
}
+
+ // Whether the Intel expression is appended after an operand index:
+ // [OperandIdx][Intel Expression]
+ // This is necessary for checking whether it is an independent
+ // Intel expression in the backend when parsing inline asm.
+ void setAppendAfterOperand() { AttachToOperandIdx = true; }
+
+ bool isPIC() const { return IsPIC; }
+ void setPIC() { IsPIC = true; }
+
bool hadError() const { return State == IES_ERROR; }
const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; }
+ bool regsUseUpError(StringRef &ErrMsg) {
+ // This case mostly happens in inline asm, e.g. Arr[BaseReg + IndexReg];
+ // we cannot introduce an additional register in inline asm in the PIC model.
+ if (IsPIC && AttachToOperandIdx)
+ ErrMsg = "Don't use 2 or more regs for mem offset in PIC model!";
+ else
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
+
void onOr() {
IntelExprState CurrState = State;
switch (State) {
@@ -655,10 +675,8 @@ private:
if (!BaseReg) {
BaseReg = TmpReg;
} else {
- if (IndexReg) {
- ErrMsg = "BaseReg/IndexReg already set!";
- return true;
- }
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
IndexReg = TmpReg;
Scale = 0;
}
@@ -716,10 +734,8 @@ private:
if (!BaseReg) {
BaseReg = TmpReg;
} else {
- if (IndexReg) {
- ErrMsg = "BaseReg/IndexReg already set!";
- return true;
- }
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
IndexReg = TmpReg;
Scale = 0;
}
@@ -777,10 +793,8 @@ private:
case IES_MULTIPLY:
// Index Register - Scale * Register
if (PrevState == IES_INTEGER) {
- if (IndexReg) {
- ErrMsg = "BaseReg/IndexReg already set!";
- return true;
- }
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
State = IES_REGISTER;
IndexReg = Reg;
// Get the scale and replace the 'Scale * Register' with '0'.
@@ -861,10 +875,8 @@ private:
State = IES_INTEGER;
if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
// Index Register - Register * Scale
- if (IndexReg) {
- ErrMsg = "BaseReg/IndexReg already set!";
- return true;
- }
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
IndexReg = TmpReg;
Scale = TmpInt;
if (checkScale(Scale, ErrMsg))
@@ -945,7 +957,7 @@ private:
BracCount++;
return false;
}
- bool onRBrac() {
+ bool onRBrac(StringRef &ErrMsg) {
IntelExprState CurrState = State;
switch (State) {
default:
@@ -955,8 +967,10 @@ private:
case IES_OFFSET:
case IES_REGISTER:
case IES_RPAREN:
- if (BracCount-- != 1)
+ if (BracCount-- != 1) {
+ ErrMsg = "unexpected bracket encountered";
return true;
+ }
State = IES_RBRAC;
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
@@ -964,7 +978,8 @@ private:
if (!BaseReg) {
BaseReg = TmpReg;
} else {
- assert (!IndexReg && "BaseReg/IndexReg already set!");
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
IndexReg = TmpReg;
Scale = 0;
}
@@ -1089,9 +1104,9 @@ private:
std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
OperandVector &FinalOperands);
- bool ParseOperand(OperandVector &Operands);
- bool ParseATTOperand(OperandVector &Operands);
- bool ParseIntelOperand(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands, StringRef Name);
+ bool parseATTOperand(OperandVector &Operands);
+ bool parseIntelOperand(OperandVector &Operands, StringRef Name);
bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
InlineAsmIdentifierInfo &Info, SMLoc &End);
bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
@@ -1111,6 +1126,8 @@ private:
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End,
bool IsParsingOffsetOperator = false);
+ void tryParseOperandIdx(AsmToken::TokenKind PrevTK,
+ IntelExprStateMachine &SM);
bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc,
SMLoc EndLoc, OperandVector &Operands);
@@ -1193,19 +1210,19 @@ private:
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
- return getSTI().getFeatureBits()[X86::Mode64Bit];
+ return getSTI().getFeatureBits()[X86::Is64Bit];
}
bool is32BitMode() const {
// FIXME: Can tablegen auto-generate this?
- return getSTI().getFeatureBits()[X86::Mode32Bit];
+ return getSTI().getFeatureBits()[X86::Is32Bit];
}
bool is16BitMode() const {
// FIXME: Can tablegen auto-generate this?
- return getSTI().getFeatureBits()[X86::Mode16Bit];
+ return getSTI().getFeatureBits()[X86::Is16Bit];
}
void SwitchMode(unsigned mode) {
MCSubtargetInfo &STI = copySTI();
- FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
+ FeatureBitset AllModes({X86::Is64Bit, X86::Is32Bit, X86::Is16Bit});
FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
FeatureBitset FB = ComputeAvailableFeatures(
STI.ToggleFeature(OldMode.flip(mode)));
@@ -1716,11 +1733,11 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
return false;
}
-bool X86AsmParser::ParseOperand(OperandVector &Operands) {
+bool X86AsmParser::parseOperand(OperandVector &Operands, StringRef Name) {
if (isParsingIntelSyntax())
- return ParseIntelOperand(Operands);
+ return parseIntelOperand(Operands, Name);
- return ParseATTOperand(Operands);
+ return parseATTOperand(Operands);
}
bool X86AsmParser::CreateMemForMSInlineAsm(
@@ -1759,8 +1776,8 @@ bool X86AsmParser::CreateMemForMSInlineAsm(
// registers in a memory expression, and though inaccessible via rip/eip.
if (IsGlobalLV && (BaseReg || IndexReg)) {
Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start,
- End, Size, Identifier, Decl,
- FrontendSize));
+ End, Size, Identifier, Decl, 0,
+ BaseReg && IndexReg));
return false;
}
// Otherwise, we set the base register to a non-zero value
@@ -1841,11 +1858,25 @@ bool X86AsmParser::ParseMasmNamedOperator(StringRef Name,
return true;
}
+// Check if the current Intel expression is appended after an operand.
+// Like: [Operand][Intel Expression]
+void X86AsmParser::tryParseOperandIdx(AsmToken::TokenKind PrevTK,
+ IntelExprStateMachine &SM) {
+ if (PrevTK != AsmToken::RBrac)
+ return;
+
+ SM.setAppendAfterOperand();
+}
+
bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
StringRef ErrMsg;
AsmToken::TokenKind PrevTK = AsmToken::Error;
+
+ if (getContext().getObjectFileInfo()->isPositionIndependent())
+ SM.setPIC();
+
bool Done = false;
while (!Done) {
// Get a fresh reference on each loop iteration in case the previous
@@ -2123,10 +2154,12 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
case AsmToken::LBrac:
if (SM.onLBrac())
return Error(Tok.getLoc(), "unexpected bracket encountered");
+ tryParseOperandIdx(PrevTK, SM);
break;
case AsmToken::RBrac:
- if (SM.onRBrac())
- return Error(Tok.getLoc(), "unexpected bracket encountered");
+ if (SM.onRBrac(ErrMsg)) {
+ return Error(Tok.getLoc(), ErrMsg);
+ }
break;
case AsmToken::LParen: SM.onLParen(); break;
case AsmToken::RParen: SM.onRParen(); break;
@@ -2477,7 +2510,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
return false;
}
-bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
+bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
@@ -2552,6 +2585,8 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
StringRef ErrMsg;
unsigned BaseReg = SM.getBaseReg();
unsigned IndexReg = SM.getIndexReg();
+ if (IndexReg && BaseReg == X86::RIP)
+ BaseReg = 0;
unsigned Scale = SM.getScale();
if (!PtrInOperand)
Size = SM.getElementSize() << 3;
@@ -2597,25 +2632,49 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
// When parsing x64 MS-style assembly, all non-absolute references to a named
// variable default to RIP-relative.
- if (Parser.isParsingMasm() && is64BitMode() && SM.getElementSize() > 0) {
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
- BaseReg, IndexReg, Scale, Start,
- End, Size,
- /*DefaultBaseReg=*/X86::RIP));
- return false;
+ unsigned DefaultBaseReg = X86::NoRegister;
+ bool MaybeDirectBranchDest = true;
+
+ if (Parser.isParsingMasm()) {
+ bool IsUnconditionalBranch =
+ Name.equals_insensitive("jmp") || Name.equals_insensitive("call");
+ if (is64BitMode() && SM.getElementSize() > 0) {
+ DefaultBaseReg = X86::RIP;
+ }
+ if (IsUnconditionalBranch) {
+ if (PtrInOperand) {
+ MaybeDirectBranchDest = false;
+ if (is64BitMode())
+ DefaultBaseReg = X86::RIP;
+ } else if (!BaseReg && !IndexReg && Disp &&
+ Disp->getKind() == MCExpr::SymbolRef) {
+ if (is64BitMode()) {
+ if (SM.getSize() == 8) {
+ MaybeDirectBranchDest = false;
+ DefaultBaseReg = X86::RIP;
+ }
+ } else {
+ if (SM.getSize() == 4 || SM.getSize() == 2)
+ MaybeDirectBranchDest = false;
+ }
+ }
+ }
}
- if ((BaseReg || IndexReg || RegNo))
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
- BaseReg, IndexReg, Scale, Start,
- End, Size));
+ if ((BaseReg || IndexReg || RegNo || DefaultBaseReg != X86::NoRegister))
+ Operands.push_back(X86Operand::CreateMem(
+ getPointerWidth(), RegNo, Disp, BaseReg, IndexReg, Scale, Start, End,
+ Size, DefaultBaseReg, /*SymName=*/StringRef(), /*OpDecl=*/nullptr,
+ /*FrontendSize=*/0, /*UseUpRegs=*/false, MaybeDirectBranchDest));
else
- Operands.push_back(
- X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size));
+ Operands.push_back(X86Operand::CreateMem(
+ getPointerWidth(), Disp, Start, End, Size, /*SymName=*/StringRef(),
+ /*OpDecl=*/nullptr, /*FrontendSize=*/0, /*UseUpRegs=*/false,
+ MaybeDirectBranchDest));
return false;
}
-bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
+bool X86AsmParser::parseATTOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
switch (getLexer().getKind()) {
case AsmToken::Dollar: {
@@ -2722,7 +2781,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
if (!getLexer().is(AsmToken::RCurly))
return Error(getLexer().getLoc(), "Expected } at this point");
Parser.Lex(); // Eat '}'
- // Assign Z with the {z} mark opernad
+ // Assign Z with the {z} mark operand
Z = X86Operand::CreateToken("{z}", StartLoc);
return false;
}
@@ -3346,7 +3405,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name = Next;
PatchedName = Name;
- ForcedDataPrefix = X86::Mode32Bit;
+ ForcedDataPrefix = X86::Is32Bit;
IsPrefix = false;
}
}
@@ -3371,7 +3430,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Read the operands.
while (true) {
- if (ParseOperand(Operands))
+ if (parseOperand(Operands, Name))
return true;
if (HandleAVX512Operand(Operands))
return true;
@@ -3774,84 +3833,27 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
}
bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
+ using namespace X86;
const MCRegisterInfo *MRI = getContext().getRegisterInfo();
-
- switch (Inst.getOpcode()) {
- case X86::VGATHERDPDYrm:
- case X86::VGATHERDPDrm:
- case X86::VGATHERDPSYrm:
- case X86::VGATHERDPSrm:
- case X86::VGATHERQPDYrm:
- case X86::VGATHERQPDrm:
- case X86::VGATHERQPSYrm:
- case X86::VGATHERQPSrm:
- case X86::VPGATHERDDYrm:
- case X86::VPGATHERDDrm:
- case X86::VPGATHERDQYrm:
- case X86::VPGATHERDQrm:
- case X86::VPGATHERQDYrm:
- case X86::VPGATHERQDrm:
- case X86::VPGATHERQQYrm:
- case X86::VPGATHERQQrm: {
- unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg());
- unsigned Index =
- MRI->getEncodingValue(Inst.getOperand(3 + X86::AddrIndexReg).getReg());
- if (Dest == Mask || Dest == Index || Mask == Index)
- return Warning(Ops[0]->getStartLoc(), "mask, index, and destination "
- "registers should be distinct");
- break;
- }
- case X86::VGATHERDPDZ128rm:
- case X86::VGATHERDPDZ256rm:
- case X86::VGATHERDPDZrm:
- case X86::VGATHERDPSZ128rm:
- case X86::VGATHERDPSZ256rm:
- case X86::VGATHERDPSZrm:
- case X86::VGATHERQPDZ128rm:
- case X86::VGATHERQPDZ256rm:
- case X86::VGATHERQPDZrm:
- case X86::VGATHERQPSZ128rm:
- case X86::VGATHERQPSZ256rm:
- case X86::VGATHERQPSZrm:
- case X86::VPGATHERDDZ128rm:
- case X86::VPGATHERDDZ256rm:
- case X86::VPGATHERDDZrm:
- case X86::VPGATHERDQZ128rm:
- case X86::VPGATHERDQZ256rm:
- case X86::VPGATHERDQZrm:
- case X86::VPGATHERQDZ128rm:
- case X86::VPGATHERQDZ256rm:
- case X86::VPGATHERQDZrm:
- case X86::VPGATHERQQZ128rm:
- case X86::VPGATHERQQZ256rm:
- case X86::VPGATHERQQZrm: {
- unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Index =
- MRI->getEncodingValue(Inst.getOperand(4 + X86::AddrIndexReg).getReg());
- if (Dest == Index)
- return Warning(Ops[0]->getStartLoc(), "index and destination registers "
- "should be distinct");
- break;
- }
- case X86::V4FMADDPSrm:
- case X86::V4FMADDPSrmk:
- case X86::V4FMADDPSrmkz:
- case X86::V4FMADDSSrm:
- case X86::V4FMADDSSrmk:
- case X86::V4FMADDSSrmkz:
- case X86::V4FNMADDPSrm:
- case X86::V4FNMADDPSrmk:
- case X86::V4FNMADDPSrmkz:
- case X86::V4FNMADDSSrm:
- case X86::V4FNMADDSSrmk:
- case X86::V4FNMADDSSrmkz:
- case X86::VP4DPWSSDSrm:
- case X86::VP4DPWSSDSrmk:
- case X86::VP4DPWSSDSrmkz:
- case X86::VP4DPWSSDrm:
- case X86::VP4DPWSSDrmk:
- case X86::VP4DPWSSDrmkz: {
+ unsigned Opcode = Inst.getOpcode();
+ uint64_t TSFlags = MII.get(Opcode).TSFlags;
+ if (isVFCMADDCPH(Opcode) || isVFCMADDCSH(Opcode) || isVFMADDCPH(Opcode) ||
+ isVFMADDCSH(Opcode)) {
+ unsigned Dest = Inst.getOperand(0).getReg();
+ for (unsigned i = 2; i < Inst.getNumOperands(); i++)
+ if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
+ return Warning(Ops[0]->getStartLoc(), "Destination register should be "
+ "distinct from source registers");
+ } else if (isVFCMULCPH(Opcode) || isVFCMULCSH(Opcode) || isVFMULCPH(Opcode) ||
+ isVFMULCSH(Opcode)) {
+ unsigned Dest = Inst.getOperand(0).getReg();
+ for (unsigned i = 1; i < Inst.getNumOperands(); i++)
+ if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
+ return Warning(Ops[0]->getStartLoc(), "Destination register should be "
+ "distinct from source registers");
+ } else if (isV4FMADDPS(Opcode) || isV4FMADDSS(Opcode) ||
+ isV4FNMADDPS(Opcode) || isV4FNMADDSS(Opcode) ||
+ isVP4DPWSSDS(Opcode) || isVP4DPWSSD(Opcode)) {
unsigned Src2 = Inst.getOperand(Inst.getNumOperands() -
X86::AddrNumOperands - 1).getReg();
unsigned Src2Enc = MRI->getEncodingValue(Src2);
@@ -3865,186 +3867,34 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
RegName.take_front(3) + Twine(GroupEnd) +
"' source group");
}
- break;
- }
- case X86::VFCMADDCPHZ128m:
- case X86::VFCMADDCPHZ256m:
- case X86::VFCMADDCPHZm:
- case X86::VFCMADDCPHZ128mb:
- case X86::VFCMADDCPHZ256mb:
- case X86::VFCMADDCPHZmb:
- case X86::VFCMADDCPHZ128mbk:
- case X86::VFCMADDCPHZ256mbk:
- case X86::VFCMADDCPHZmbk:
- case X86::VFCMADDCPHZ128mbkz:
- case X86::VFCMADDCPHZ256mbkz:
- case X86::VFCMADDCPHZmbkz:
- case X86::VFCMADDCPHZ128mk:
- case X86::VFCMADDCPHZ256mk:
- case X86::VFCMADDCPHZmk:
- case X86::VFCMADDCPHZ128mkz:
- case X86::VFCMADDCPHZ256mkz:
- case X86::VFCMADDCPHZmkz:
- case X86::VFCMADDCPHZ128r:
- case X86::VFCMADDCPHZ256r:
- case X86::VFCMADDCPHZr:
- case X86::VFCMADDCPHZ128rk:
- case X86::VFCMADDCPHZ256rk:
- case X86::VFCMADDCPHZrk:
- case X86::VFCMADDCPHZ128rkz:
- case X86::VFCMADDCPHZ256rkz:
- case X86::VFCMADDCPHZrkz:
- case X86::VFCMADDCPHZrb:
- case X86::VFCMADDCPHZrbk:
- case X86::VFCMADDCPHZrbkz:
- case X86::VFCMADDCSHZm:
- case X86::VFCMADDCSHZmk:
- case X86::VFCMADDCSHZmkz:
- case X86::VFCMADDCSHZr:
- case X86::VFCMADDCSHZrb:
- case X86::VFCMADDCSHZrbk:
- case X86::VFCMADDCSHZrbkz:
- case X86::VFCMADDCSHZrk:
- case X86::VFCMADDCSHZrkz:
- case X86::VFMADDCPHZ128m:
- case X86::VFMADDCPHZ256m:
- case X86::VFMADDCPHZm:
- case X86::VFMADDCPHZ128mb:
- case X86::VFMADDCPHZ256mb:
- case X86::VFMADDCPHZmb:
- case X86::VFMADDCPHZ128mbk:
- case X86::VFMADDCPHZ256mbk:
- case X86::VFMADDCPHZmbk:
- case X86::VFMADDCPHZ128mbkz:
- case X86::VFMADDCPHZ256mbkz:
- case X86::VFMADDCPHZmbkz:
- case X86::VFMADDCPHZ128mk:
- case X86::VFMADDCPHZ256mk:
- case X86::VFMADDCPHZmk:
- case X86::VFMADDCPHZ128mkz:
- case X86::VFMADDCPHZ256mkz:
- case X86::VFMADDCPHZmkz:
- case X86::VFMADDCPHZ128r:
- case X86::VFMADDCPHZ256r:
- case X86::VFMADDCPHZr:
- case X86::VFMADDCPHZ128rk:
- case X86::VFMADDCPHZ256rk:
- case X86::VFMADDCPHZrk:
- case X86::VFMADDCPHZ128rkz:
- case X86::VFMADDCPHZ256rkz:
- case X86::VFMADDCPHZrkz:
- case X86::VFMADDCPHZrb:
- case X86::VFMADDCPHZrbk:
- case X86::VFMADDCPHZrbkz:
- case X86::VFMADDCSHZm:
- case X86::VFMADDCSHZmk:
- case X86::VFMADDCSHZmkz:
- case X86::VFMADDCSHZr:
- case X86::VFMADDCSHZrb:
- case X86::VFMADDCSHZrbk:
- case X86::VFMADDCSHZrbkz:
- case X86::VFMADDCSHZrk:
- case X86::VFMADDCSHZrkz: {
- unsigned Dest = Inst.getOperand(0).getReg();
- for (unsigned i = 2; i < Inst.getNumOperands(); i++)
- if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
- return Warning(Ops[0]->getStartLoc(), "Destination register should be "
- "distinct from source registers");
- break;
- }
- case X86::VFCMULCPHZ128rm:
- case X86::VFCMULCPHZ256rm:
- case X86::VFCMULCPHZrm:
- case X86::VFCMULCPHZ128rmb:
- case X86::VFCMULCPHZ256rmb:
- case X86::VFCMULCPHZrmb:
- case X86::VFCMULCPHZ128rmbk:
- case X86::VFCMULCPHZ256rmbk:
- case X86::VFCMULCPHZrmbk:
- case X86::VFCMULCPHZ128rmbkz:
- case X86::VFCMULCPHZ256rmbkz:
- case X86::VFCMULCPHZrmbkz:
- case X86::VFCMULCPHZ128rmk:
- case X86::VFCMULCPHZ256rmk:
- case X86::VFCMULCPHZrmk:
- case X86::VFCMULCPHZ128rmkz:
- case X86::VFCMULCPHZ256rmkz:
- case X86::VFCMULCPHZrmkz:
- case X86::VFCMULCPHZ128rr:
- case X86::VFCMULCPHZ256rr:
- case X86::VFCMULCPHZrr:
- case X86::VFCMULCPHZ128rrk:
- case X86::VFCMULCPHZ256rrk:
- case X86::VFCMULCPHZrrk:
- case X86::VFCMULCPHZ128rrkz:
- case X86::VFCMULCPHZ256rrkz:
- case X86::VFCMULCPHZrrkz:
- case X86::VFCMULCPHZrrb:
- case X86::VFCMULCPHZrrbk:
- case X86::VFCMULCPHZrrbkz:
- case X86::VFCMULCSHZrm:
- case X86::VFCMULCSHZrmk:
- case X86::VFCMULCSHZrmkz:
- case X86::VFCMULCSHZrr:
- case X86::VFCMULCSHZrrb:
- case X86::VFCMULCSHZrrbk:
- case X86::VFCMULCSHZrrbkz:
- case X86::VFCMULCSHZrrk:
- case X86::VFCMULCSHZrrkz:
- case X86::VFMULCPHZ128rm:
- case X86::VFMULCPHZ256rm:
- case X86::VFMULCPHZrm:
- case X86::VFMULCPHZ128rmb:
- case X86::VFMULCPHZ256rmb:
- case X86::VFMULCPHZrmb:
- case X86::VFMULCPHZ128rmbk:
- case X86::VFMULCPHZ256rmbk:
- case X86::VFMULCPHZrmbk:
- case X86::VFMULCPHZ128rmbkz:
- case X86::VFMULCPHZ256rmbkz:
- case X86::VFMULCPHZrmbkz:
- case X86::VFMULCPHZ128rmk:
- case X86::VFMULCPHZ256rmk:
- case X86::VFMULCPHZrmk:
- case X86::VFMULCPHZ128rmkz:
- case X86::VFMULCPHZ256rmkz:
- case X86::VFMULCPHZrmkz:
- case X86::VFMULCPHZ128rr:
- case X86::VFMULCPHZ256rr:
- case X86::VFMULCPHZrr:
- case X86::VFMULCPHZ128rrk:
- case X86::VFMULCPHZ256rrk:
- case X86::VFMULCPHZrrk:
- case X86::VFMULCPHZ128rrkz:
- case X86::VFMULCPHZ256rrkz:
- case X86::VFMULCPHZrrkz:
- case X86::VFMULCPHZrrb:
- case X86::VFMULCPHZrrbk:
- case X86::VFMULCPHZrrbkz:
- case X86::VFMULCSHZrm:
- case X86::VFMULCSHZrmk:
- case X86::VFMULCSHZrmkz:
- case X86::VFMULCSHZrr:
- case X86::VFMULCSHZrrb:
- case X86::VFMULCSHZrrbk:
- case X86::VFMULCSHZrrbkz:
- case X86::VFMULCSHZrrk:
- case X86::VFMULCSHZrrkz: {
- unsigned Dest = Inst.getOperand(0).getReg();
- for (unsigned i = 1; i < Inst.getNumOperands(); i++)
- if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
- return Warning(Ops[0]->getStartLoc(), "Destination register should be "
- "distinct from source registers");
- break;
- }
+ } else if (isVGATHERDPD(Opcode) || isVGATHERDPS(Opcode) ||
+ isVGATHERQPD(Opcode) || isVGATHERQPS(Opcode) ||
+ isVPGATHERDD(Opcode) || isVPGATHERDQ(Opcode) ||
+ isVPGATHERQD(Opcode) || isVPGATHERQQ(Opcode)) {
+ bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
+ if (HasEVEX) {
+ unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Index = MRI->getEncodingValue(
+ Inst.getOperand(4 + X86::AddrIndexReg).getReg());
+ if (Dest == Index)
+ return Warning(Ops[0]->getStartLoc(), "index and destination registers "
+ "should be distinct");
+ } else {
+ unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ unsigned Index = MRI->getEncodingValue(
+ Inst.getOperand(3 + X86::AddrIndexReg).getReg());
+ if (Dest == Mask || Dest == Index || Mask == Index)
+ return Warning(Ops[0]->getStartLoc(), "mask, index, and destination "
+ "registers should be distinct");
+ }
}
- const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
// Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to
// check this with the legacy encoding, VEX/EVEX/XOP don't use REX.
- if ((MCID.TSFlags & X86II::EncodingMask) == 0) {
+ if ((TSFlags & X86II::EncodingMask) == 0) {
MCPhysReg HReg = X86::NoRegister;
- bool UsesRex = MCID.TSFlags & X86II::REX_W;
+ bool UsesRex = TSFlags & X86II::REX_W;
unsigned NumOps = Inst.getNumOperands();
for (unsigned i = 0; i != NumOps; ++i) {
const MCOperand &MO = Inst.getOperand(i);
@@ -4313,15 +4163,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
// when matching the instruction.
- if (ForcedDataPrefix == X86::Mode32Bit)
- SwitchMode(X86::Mode32Bit);
+ if (ForcedDataPrefix == X86::Is32Bit)
+ SwitchMode(X86::Is32Bit);
// First, try a direct match.
FeatureBitset MissingFeatures;
unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo,
MissingFeatures, MatchingInlineAsm,
isParsingIntelSyntax());
- if (ForcedDataPrefix == X86::Mode32Bit) {
- SwitchMode(X86::Mode16Bit);
+ if (ForcedDataPrefix == X86::Is32Bit) {
+ SwitchMode(X86::Is16Bit);
ForcedDataPrefix = 0;
}
switch (OriginalError) {
@@ -4840,8 +4690,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) {
if (getParser().parseAbsoluteExpression(Control))
return true;
}
- if (getParser().parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.nops' directive"))
+ if (getParser().parseEOL())
return true;
if (NumBytes <= 0) {
@@ -4863,7 +4712,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) {
/// parseDirectiveEven
/// ::= .even
bool X86AsmParser::parseDirectiveEven(SMLoc L) {
- if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+ if (parseEOL())
return false;
const MCSection *Section = getStreamer().getCurrentSectionOnly();
@@ -4871,7 +4720,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
getStreamer().initSections(false, getSTI());
Section = getStreamer().getCurrentSectionOnly();
}
- if (Section->UseCodeAlign())
+ if (Section->useCodeAlign())
getStreamer().emitCodeAlignment(2, &getSTI(), 0);
else
getStreamer().emitValueToAlignment(2, 0, 1, 0);
@@ -4886,7 +4735,7 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
if (IDVal == ".code16") {
Parser.Lex();
if (!is16BitMode()) {
- SwitchMode(X86::Mode16Bit);
+ SwitchMode(X86::Is16Bit);
getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
}
} else if (IDVal == ".code16gcc") {
@@ -4894,19 +4743,19 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
Parser.Lex();
Code16GCC = true;
if (!is16BitMode()) {
- SwitchMode(X86::Mode16Bit);
+ SwitchMode(X86::Is16Bit);
getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
}
} else if (IDVal == ".code32") {
Parser.Lex();
if (!is32BitMode()) {
- SwitchMode(X86::Mode32Bit);
+ SwitchMode(X86::Is32Bit);
getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
}
} else if (IDVal == ".code64") {
Parser.Lex();
if (!is64BitMode()) {
- SwitchMode(X86::Mode64Bit);
+ SwitchMode(X86::Is64Bit);
getParser().getStreamer().emitAssemblerFlag(MCAF_Code64);
}
} else {
@@ -5035,7 +4884,7 @@ bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) {
return TokError("unexpected token in directive");
getParser().Lex();
- getStreamer().EmitWinCFIPushReg(Reg, Loc);
+ getStreamer().emitWinCFIPushReg(Reg, Loc);
return false;
}
@@ -5055,7 +4904,7 @@ bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) {
return TokError("unexpected token in directive");
getParser().Lex();
- getStreamer().EmitWinCFISetFrame(Reg, Off, Loc);
+ getStreamer().emitWinCFISetFrame(Reg, Off, Loc);
return false;
}
@@ -5075,7 +4924,7 @@ bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) {
return TokError("unexpected token in directive");
getParser().Lex();
- getStreamer().EmitWinCFISaveReg(Reg, Off, Loc);
+ getStreamer().emitWinCFISaveReg(Reg, Off, Loc);
return false;
}
@@ -5095,7 +4944,7 @@ bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) {
return TokError("unexpected token in directive");
getParser().Lex();
- getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc);
+ getStreamer().emitWinCFISaveXMM(Reg, Off, Loc);
return false;
}
@@ -5116,7 +4965,7 @@ bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) {
return TokError("unexpected token in directive");
getParser().Lex();
- getStreamer().EmitWinCFIPushFrame(Code, Loc);
+ getStreamer().emitWinCFIPushFrame(Code, Loc);
return false;
}
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 67b1244708a8..075b800f9e20 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -17,6 +17,8 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
@@ -35,6 +37,10 @@ struct X86Operand final : public MCParsedAsmOperand {
void *OpDecl;
bool AddressOf;
+ /// This is used for inline asm which may specify base reg and index reg for
+ /// MemOp, e.g. ARR[eax + ecx*4], so no extra reg can be used for MemOp.
+ bool UseUpRegs = false;
+
struct TokOp {
const char *Data;
unsigned Length;
@@ -66,6 +72,11 @@ struct X86Operand final : public MCParsedAsmOperand {
/// If the memory operand is unsized and there are multiple instruction
/// matches, prefer the one with this size.
unsigned FrontendSize;
+
+ /// If false, then this operand must be a memory operand for an indirect
+ /// branch instruction. Otherwise, this operand may belong to either a
+ /// direct or indirect branch instruction.
+ bool MaybeDirectBranchDest;
};
union {
@@ -203,6 +214,10 @@ struct X86Operand final : public MCParsedAsmOperand {
assert(Kind == Memory && "Invalid access!");
return Mem.FrontendSize;
}
+ bool isMaybeDirectBranchDest() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.MaybeDirectBranchDest;
+ }
bool isToken() const override {return Kind == Token; }
@@ -285,12 +300,6 @@ struct X86Operand final : public MCParsedAsmOperand {
bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; }
- bool isMemPlaceholder(const MCInstrDesc &Desc) const override {
- // Only MS InlineAsm uses global variables with registers rather than
- // rip/eip.
- return isMem() && !Mem.DefaultBaseReg && Mem.FrontendSize;
- }
-
bool needAddressOf() const override { return AddressOf; }
bool isMem() const override { return Kind == Memory; }
@@ -374,8 +383,9 @@ struct X86Operand final : public MCParsedAsmOperand {
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
- !getMemIndexReg() && getMemScale() == 1;
+ !getMemIndexReg() && getMemScale() == 1 && isMaybeDirectBranchDest();
}
+
bool isAVX512RC() const{
return isImm();
}
@@ -384,6 +394,8 @@ struct X86Operand final : public MCParsedAsmOperand {
return isAbsMem() && Mem.ModeSize == 16;
}
+ bool isMemUseUpRegs() const override { return UseUpRegs; }
+
bool isSrcIdx() const {
return !getMemIndexReg() && getMemScale() == 1 &&
(getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
@@ -669,7 +681,8 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand>
CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr, unsigned FrontendSize = 0) {
+ void *OpDecl = nullptr, unsigned FrontendSize = 0,
+ bool UseUpRegs = false, bool MaybeDirectBranchDest = true) {
auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
@@ -680,6 +693,8 @@ struct X86Operand final : public MCParsedAsmOperand {
Res->Mem.Size = Size;
Res->Mem.ModeSize = ModeSize;
Res->Mem.FrontendSize = FrontendSize;
+ Res->Mem.MaybeDirectBranchDest = MaybeDirectBranchDest;
+ Res->UseUpRegs = UseUpRegs;
Res->SymName = SymName;
Res->OpDecl = OpDecl;
Res->AddressOf = false;
@@ -693,7 +708,8 @@ struct X86Operand final : public MCParsedAsmOperand {
SMLoc EndLoc, unsigned Size = 0,
unsigned DefaultBaseReg = X86::NoRegister,
StringRef SymName = StringRef(), void *OpDecl = nullptr,
- unsigned FrontendSize = 0) {
+ unsigned FrontendSize = 0, bool UseUpRegs = false,
+ bool MaybeDirectBranchDest = true) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) &&
@@ -712,6 +728,8 @@ struct X86Operand final : public MCParsedAsmOperand {
Res->Mem.Size = Size;
Res->Mem.ModeSize = ModeSize;
Res->Mem.FrontendSize = FrontendSize;
+ Res->Mem.MaybeDirectBranchDest = MaybeDirectBranchDest;
+ Res->UseUpRegs = UseUpRegs;
Res->SymName = SymName;
Res->OpDecl = OpDecl;
Res->AddressOf = false;
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 908eb6d1fab1..1da6bf86397e 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -493,16 +493,15 @@ static int readPrefixes(struct InternalInstruction *insn) {
insn->displacementSize = (insn->hasAdSize ? 2 : 4);
insn->immediateSize = (insn->hasOpSize ? 2 : 4);
} else if (insn->mode == MODE_64BIT) {
+ insn->displacementSize = 4;
if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
insn->registerSize = 8;
insn->addressSize = (insn->hasAdSize ? 4 : 8);
- insn->displacementSize = 4;
insn->immediateSize = 4;
insn->hasOpSize = false;
} else {
insn->registerSize = (insn->hasOpSize ? 2 : 4);
insn->addressSize = (insn->hasAdSize ? 4 : 8);
- insn->displacementSize = (insn->hasOpSize ? 2 : 4);
insn->immediateSize = (insn->hasOpSize ? 2 : 4);
}
}
@@ -1722,13 +1721,13 @@ X86GenericDisassembler::X86GenericDisassembler(
std::unique_ptr<const MCInstrInfo> MII)
: MCDisassembler(STI, Ctx), MII(std::move(MII)) {
const FeatureBitset &FB = STI.getFeatureBits();
- if (FB[X86::Mode16Bit]) {
+ if (FB[X86::Is16Bit]) {
fMode = MODE_16BIT;
return;
- } else if (FB[X86::Mode32Bit]) {
+ } else if (FB[X86::Is32Bit]) {
fMode = MODE_32BIT;
return;
- } else if (FB[X86::Mode64Bit]) {
+ } else if (FB[X86::Is64Bit]) {
fMode = MODE_64BIT;
return;
}
@@ -1801,46 +1800,6 @@ static void translateRegister(MCInst &mcInst, Reg reg) {
mcInst.addOperand(MCOperand::createReg(llvmRegnum));
}
-/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
-/// immediate Value in the MCInst.
-///
-/// @param Value - The immediate Value, has had any PC adjustment made by
-/// the caller.
-/// @param isBranch - If the instruction is a branch instruction
-/// @param Address - The starting address of the instruction
-/// @param Offset - The byte offset to this immediate in the instruction
-/// @param Width - The byte width of this immediate in the instruction
-///
-/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
-/// called then that function is called to get any symbolic information for the
-/// immediate in the instruction using the Address, Offset and Width. If that
-/// returns non-zero then the symbolic information it returns is used to create
-/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo()
-/// returns zero and isBranch is true then a symbol look up for immediate Value
-/// is done and if a symbol is found an MCExpr is created with that, else
-/// an MCExpr with the immediate Value is created. This function returns true
-/// if it adds an operand to the MCInst and false otherwise.
-static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
- uint64_t Address, uint64_t Offset,
- uint64_t Width, MCInst &MI,
- const MCDisassembler *Dis) {
- return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
- Offset, Width);
-}
-
-/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
-/// referenced by a load instruction with the base register that is the rip.
-/// These can often be addresses in a literal pool. The Address of the
-/// instruction and its immediate Value are used to determine the address
-/// being referenced in the literal pool entry. The SymbolLookUp call back will
-/// return a pointer to a literal 'C' string if the referenced address is an
-/// address into a section with 'C' string literals.
-static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
- const void *Decoder) {
- const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- Dis->tryAddingPcLoadReferenceComment(Value, Address);
-}
-
static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
0, // SEG_OVERRIDE_NONE
X86::CS,
@@ -1914,8 +1873,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
uint64_t pcrel = 0;
if (type == TYPE_REL) {
isBranch = true;
- pcrel = insn.startLocation +
- insn.immediateOffset + insn.immediateSize;
+ pcrel = insn.startLocation + insn.length;
switch (operand.encoding) {
default:
break;
@@ -1990,9 +1948,9 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
break;
}
- if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
- insn.immediateOffset, insn.immediateSize,
- mcInst, Dis))
+ if (!Dis->tryAddingSymbolicOperand(
+ mcInst, immediate + pcrel, insn.startLocation, isBranch,
+ insn.immediateOffset, insn.immediateSize, insn.length))
mcInst.addOperand(MCOperand::createImm(immediate));
if (type == TYPE_MOFFS) {
@@ -2129,11 +2087,10 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
return true;
}
if (insn.mode == MODE_64BIT){
- pcrel = insn.startLocation +
- insn.displacementOffset + insn.displacementSize;
- tryAddingPcLoadReferenceComment(insn.startLocation +
- insn.displacementOffset,
- insn.displacement + pcrel, Dis);
+ pcrel = insn.startLocation + insn.length;
+ Dis->tryAddingPcLoadReferenceComment(insn.displacement + pcrel,
+ insn.startLocation +
+ insn.displacementOffset);
// Section 2.2.1.6
baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP :
X86::RIP);
@@ -2193,9 +2150,13 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
mcInst.addOperand(baseReg);
mcInst.addOperand(scaleAmount);
mcInst.addOperand(indexReg);
- if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
- insn.startLocation, insn.displacementOffset,
- insn.displacementSize, mcInst, Dis))
+
+ const uint8_t dispSize =
+ (insn.eaDisplacement == EA_DISP_NONE) ? 0 : insn.displacementSize;
+
+ if (!Dis->tryAddingSymbolicOperand(
+ mcInst, insn.displacement + pcrel, insn.startLocation, false,
+ insn.displacementOffset, dispSize, insn.length))
mcInst.addOperand(displacement);
mcInst.addOperand(segmentReg);
return false;
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
index 24d26751f0a1..61e1b6b27a85 100644
--- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -35,7 +35,7 @@ public:
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
- ~X86InstrPostProcess() {}
+ ~X86InstrPostProcess() = default;
void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
const MCInst &MCI) override;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index baacf2f46183..6fd3db4515ec 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -46,7 +46,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (CommentStream)
HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
- printInstFlags(MI, OS);
+ printInstFlags(MI, OS, STI);
// Output CALLpcrel32 as "callq" in 64-bit mode.
// In Intel annotation it's always emitted as "call".
@@ -55,7 +55,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
// InstrInfo.td as soon as Requires clause is supported properly
// for InstAlias.
if (MI->getOpcode() == X86::CALLpcrel32 &&
- (STI.getFeatureBits()[X86::Mode64Bit])) {
+ (STI.getFeatureBits()[X86::Is64Bit])) {
OS << "\tcallq\t";
printPCRelImm(MI, Address, 0, OS);
}
@@ -65,8 +65,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
// 0x66 to be interpreted as "data16" by the asm printer.
// Thus we add an adjustment here in order to print the "right" instruction.
else if (MI->getOpcode() == X86::DATA16_PREFIX &&
- STI.getFeatureBits()[X86::Mode16Bit]) {
- OS << "\tdata32";
+ STI.getFeatureBits()[X86::Is16Bit]) {
+ OS << "\tdata32";
}
// Try to print any aliases first.
else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3df48b466d07..2d92b8d5b574 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -8,6 +8,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86InstrRelaxTables.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
@@ -222,87 +223,7 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) {
static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
unsigned Op = Inst.getOpcode();
- switch (Op) {
- default:
- return Op;
-
- // IMUL
- case X86::IMUL16rri8: return X86::IMUL16rri;
- case X86::IMUL16rmi8: return X86::IMUL16rmi;
- case X86::IMUL32rri8: return X86::IMUL32rri;
- case X86::IMUL32rmi8: return X86::IMUL32rmi;
- case X86::IMUL64rri8: return X86::IMUL64rri32;
- case X86::IMUL64rmi8: return X86::IMUL64rmi32;
-
- // AND
- case X86::AND16ri8: return X86::AND16ri;
- case X86::AND16mi8: return X86::AND16mi;
- case X86::AND32ri8: return X86::AND32ri;
- case X86::AND32mi8: return X86::AND32mi;
- case X86::AND64ri8: return X86::AND64ri32;
- case X86::AND64mi8: return X86::AND64mi32;
-
- // OR
- case X86::OR16ri8: return X86::OR16ri;
- case X86::OR16mi8: return X86::OR16mi;
- case X86::OR32ri8: return X86::OR32ri;
- case X86::OR32mi8: return X86::OR32mi;
- case X86::OR64ri8: return X86::OR64ri32;
- case X86::OR64mi8: return X86::OR64mi32;
-
- // XOR
- case X86::XOR16ri8: return X86::XOR16ri;
- case X86::XOR16mi8: return X86::XOR16mi;
- case X86::XOR32ri8: return X86::XOR32ri;
- case X86::XOR32mi8: return X86::XOR32mi;
- case X86::XOR64ri8: return X86::XOR64ri32;
- case X86::XOR64mi8: return X86::XOR64mi32;
-
- // ADD
- case X86::ADD16ri8: return X86::ADD16ri;
- case X86::ADD16mi8: return X86::ADD16mi;
- case X86::ADD32ri8: return X86::ADD32ri;
- case X86::ADD32mi8: return X86::ADD32mi;
- case X86::ADD64ri8: return X86::ADD64ri32;
- case X86::ADD64mi8: return X86::ADD64mi32;
-
- // ADC
- case X86::ADC16ri8: return X86::ADC16ri;
- case X86::ADC16mi8: return X86::ADC16mi;
- case X86::ADC32ri8: return X86::ADC32ri;
- case X86::ADC32mi8: return X86::ADC32mi;
- case X86::ADC64ri8: return X86::ADC64ri32;
- case X86::ADC64mi8: return X86::ADC64mi32;
-
- // SUB
- case X86::SUB16ri8: return X86::SUB16ri;
- case X86::SUB16mi8: return X86::SUB16mi;
- case X86::SUB32ri8: return X86::SUB32ri;
- case X86::SUB32mi8: return X86::SUB32mi;
- case X86::SUB64ri8: return X86::SUB64ri32;
- case X86::SUB64mi8: return X86::SUB64mi32;
-
- // SBB
- case X86::SBB16ri8: return X86::SBB16ri;
- case X86::SBB16mi8: return X86::SBB16mi;
- case X86::SBB32ri8: return X86::SBB32ri;
- case X86::SBB32mi8: return X86::SBB32mi;
- case X86::SBB64ri8: return X86::SBB64ri32;
- case X86::SBB64mi8: return X86::SBB64mi32;
-
- // CMP
- case X86::CMP16ri8: return X86::CMP16ri;
- case X86::CMP16mi8: return X86::CMP16mi;
- case X86::CMP32ri8: return X86::CMP32ri;
- case X86::CMP32mi8: return X86::CMP32mi;
- case X86::CMP64ri8: return X86::CMP64ri32;
- case X86::CMP64mi8: return X86::CMP64mi32;
-
- // PUSH
- case X86::PUSH32i8: return X86::PUSHi32;
- case X86::PUSH16i8: return X86::PUSHi16;
- case X86::PUSH64i8: return X86::PUSH64i32;
- }
+ return X86::getRelaxedOpcodeArith(Op);
}
static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) {
@@ -372,7 +293,7 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst,
/// - If the instruction has a ESP/EBP base register, use SS.
/// - Otherwise use DS.
uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const {
- assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) &&
+ assert((STI.hasFeature(X86::Is32Bit) || STI.hasFeature(X86::Is64Bit)) &&
"Prefixes can be added only in 32-bit or 64-bit mode.");
const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
uint64_t TSFlags = Desc.TSFlags;
@@ -413,7 +334,7 @@ uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const {
if (SegmentReg != 0)
return X86::getSegmentOverridePrefixForReg(SegmentReg);
- if (STI.hasFeature(X86::Mode64Bit))
+ if (STI.hasFeature(X86::Is64Bit))
return X86::CS_Encoding;
if (MemoryOperand >= 0) {
@@ -572,7 +493,7 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
return false;
// Branches only need to be aligned in 32-bit or 64-bit mode.
- if (!(STI.hasFeature(X86::Mode64Bit) || STI.hasFeature(X86::Mode32Bit)))
+ if (!(STI.hasFeature(X86::Is64Bit) || STI.hasFeature(X86::Is32Bit)))
return false;
return true;
@@ -834,7 +755,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
void X86AsmBackend::relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const {
// The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
- bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ bool Is16BitMode = STI.getFeatureBits()[X86::Is16Bit];
unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode);
if (RelaxedOp == Inst.getOpcode()) {
@@ -853,7 +774,7 @@ void X86AsmBackend::relaxInstruction(MCInst &Inst,
static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
auto &Inst = RF.getInst();
auto &STI = *RF.getSubtargetInfo();
- bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ bool Is16BitMode = STI.getFeatureBits()[X86::Is16Bit];
return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
}
@@ -1077,9 +998,9 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
}
unsigned X86AsmBackend::getMaximumNopSize(const MCSubtargetInfo &STI) const {
- if (STI.hasFeature(X86::Mode16Bit))
+ if (STI.hasFeature(X86::Is16Bit))
return 4;
- if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
+ if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Is64Bit))
return 1;
if (STI.getFeatureBits()[X86::TuningFast7ByteNOP])
return 7;
@@ -1134,7 +1055,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
};
const char(*Nops)[11] =
- STI->getFeatureBits()[X86::Mode16Bit] ? Nops16Bit : Nops32Bit;
+ STI->getFeatureBits()[X86::Is16Bit] ? Nops16Bit : Nops32Bit;
uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(*STI);
@@ -1449,7 +1370,6 @@ public:
unsigned InstrOffset = 0;
unsigned StackAdjust = 0;
unsigned StackSize = 0;
- unsigned NumDefCFAOffsets = 0;
int MinAbsOffset = std::numeric_limits<int>::max();
for (const MCCFIInstruction &Inst : Instrs) {
@@ -1457,7 +1377,7 @@ public:
default:
// Any other CFI directives indicate a frame that we aren't prepared
// to represent via compact unwind, so just bail out.
- return 0;
+ return CU::UNWIND_MODE_DWARF;
case MCCFIInstruction::OpDefCfaRegister: {
// Defines a frame pointer. E.g.
//
@@ -1471,7 +1391,7 @@ public:
// generate a compact unwinding representation, so bail out.
if (*MRI.getLLVMRegNum(Inst.getRegister(), true) !=
(Is64Bit ? X86::RBP : X86::EBP))
- return 0;
+ return CU::UNWIND_MODE_DWARF;
// Reset the counts.
memset(SavedRegs, 0, sizeof(SavedRegs));
@@ -1497,7 +1417,6 @@ public:
// .cfi_def_cfa_offset 80
//
StackSize = Inst.getOffset() / StackDivide;
- ++NumDefCFAOffsets;
break;
}
case MCCFIInstruction::OpOffset: {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 167580ec1ed0..e78e98cfc09e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -18,10 +18,11 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
-#include <cstdint>
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -349,7 +350,8 @@ void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo,
}
}
-void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
+void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O,
+ const MCSubtargetInfo &STI) {
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
uint64_t TSFlags = Desc.TSFlags;
unsigned Flags = MI->getFlags();
@@ -379,6 +381,20 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
O << "\t{disp8}";
else if (Flags & X86::IP_USE_DISP32)
O << "\t{disp32}";
+
+ // Determine where the memory operand starts, if present
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand != -1)
+ MemoryOperand += X86II::getOperandBias(Desc);
+
+ // Address-Size override prefix
+ if (Flags & X86::IP_HAS_AD_SIZE &&
+ !X86_MC::needsAddressSizeOverride(*MI, STI, MemoryOperand, TSFlags)) {
+ if (STI.hasFeature(X86::Is16Bit) || STI.hasFeature(X86::Is64Bit))
+ O << "\taddr32\t";
+ else if (STI.hasFeature(X86::Is32Bit))
+ O << "\taddr16\t";
+ }
}
void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index fd82bdcd1a23..0cb5bf014b20 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -33,7 +33,8 @@ public:
raw_ostream &O);
protected:
- void printInstFlags(const MCInst *MI, raw_ostream &O);
+ void printInstFlags(const MCInst *MI, raw_ostream &O,
+ const MCSubtargetInfo &STI);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
};
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp
new file mode 100644
index 000000000000..901082ce6cf3
--- /dev/null
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp
@@ -0,0 +1,165 @@
+//===- X86InstrRelaxTables.cpp - X86 Instruction Relaxation Tables -*- C++ -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 instruction relaxation tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrRelaxTables.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+
+// These tables are sorted by their ShortOp value allowing them to be binary
+// searched at runtime without the need for additional storage. The enum values
+// are currently emitted in X86GenInstrInfo.inc in alphabetical order, which
+// makes sorting these tables a simple matter of alphabetizing the table.
+static const X86InstrRelaxTableEntry InstrRelaxTable[] = {
+ // ADC
+ { X86::ADC16mi8, X86::ADC16mi },
+ { X86::ADC16ri8, X86::ADC16ri },
+ { X86::ADC32mi8, X86::ADC32mi },
+ { X86::ADC32ri8, X86::ADC32ri },
+ { X86::ADC64mi8, X86::ADC64mi32 },
+ { X86::ADC64ri8, X86::ADC64ri32 },
+ // ADD
+ { X86::ADD16mi8, X86::ADD16mi },
+ { X86::ADD16ri8, X86::ADD16ri },
+ { X86::ADD32mi8, X86::ADD32mi },
+ { X86::ADD32ri8, X86::ADD32ri },
+ { X86::ADD64mi8, X86::ADD64mi32 },
+ { X86::ADD64ri8, X86::ADD64ri32 },
+ // AND
+ { X86::AND16mi8, X86::AND16mi },
+ { X86::AND16ri8, X86::AND16ri },
+ { X86::AND32mi8, X86::AND32mi },
+ { X86::AND32ri8, X86::AND32ri },
+ { X86::AND64mi8, X86::AND64mi32 },
+ { X86::AND64ri8, X86::AND64ri32 },
+ // CMP
+ { X86::CMP16mi8, X86::CMP16mi },
+ { X86::CMP16ri8, X86::CMP16ri },
+ { X86::CMP32mi8, X86::CMP32mi },
+ { X86::CMP32ri8, X86::CMP32ri },
+ { X86::CMP64mi8, X86::CMP64mi32 },
+ { X86::CMP64ri8, X86::CMP64ri32 },
+ // IMUL
+ { X86::IMUL16rmi8, X86::IMUL16rmi },
+ { X86::IMUL16rri8, X86::IMUL16rri },
+ { X86::IMUL32rmi8, X86::IMUL32rmi },
+ { X86::IMUL32rri8, X86::IMUL32rri },
+ { X86::IMUL64rmi8, X86::IMUL64rmi32 },
+ { X86::IMUL64rri8, X86::IMUL64rri32 },
+ // OR
+ { X86::OR16mi8, X86::OR16mi },
+ { X86::OR16ri8, X86::OR16ri },
+ { X86::OR32mi8, X86::OR32mi },
+ { X86::OR32ri8, X86::OR32ri },
+ { X86::OR64mi8, X86::OR64mi32 },
+ { X86::OR64ri8, X86::OR64ri32 },
+ // PUSH
+ { X86::PUSH16i8, X86::PUSHi16 },
+ { X86::PUSH32i8, X86::PUSHi32 },
+ { X86::PUSH64i8, X86::PUSH64i32 },
+ // SBB
+ { X86::SBB16mi8, X86::SBB16mi },
+ { X86::SBB16ri8, X86::SBB16ri },
+ { X86::SBB32mi8, X86::SBB32mi },
+ { X86::SBB32ri8, X86::SBB32ri },
+ { X86::SBB64mi8, X86::SBB64mi32 },
+ { X86::SBB64ri8, X86::SBB64ri32 },
+ // SUB
+ { X86::SUB16mi8, X86::SUB16mi },
+ { X86::SUB16ri8, X86::SUB16ri },
+ { X86::SUB32mi8, X86::SUB32mi },
+ { X86::SUB32ri8, X86::SUB32ri },
+ { X86::SUB64mi8, X86::SUB64mi32 },
+ { X86::SUB64ri8, X86::SUB64ri32 },
+ // XOR
+ { X86::XOR16mi8, X86::XOR16mi },
+ { X86::XOR16ri8, X86::XOR16ri },
+ { X86::XOR32mi8, X86::XOR32mi },
+ { X86::XOR32ri8, X86::XOR32ri },
+ { X86::XOR64mi8, X86::XOR64mi32 },
+ { X86::XOR64ri8, X86::XOR64ri32 },
+};
+
+static const X86InstrRelaxTableEntry *
+lookupRelaxTableImpl(ArrayRef<X86InstrRelaxTableEntry> Table,
+ unsigned ShortOp) {
+#ifndef NDEBUG
+ // Make sure the tables are sorted.
+ static std::atomic<bool> RelaxTableChecked(false);
+ if (!RelaxTableChecked.load(std::memory_order_relaxed)) {
+ assert(llvm::is_sorted(InstrRelaxTable) &&
+ std::adjacent_find(std::begin(InstrRelaxTable),
+ std::end(InstrRelaxTable)) ==
+ std::end(InstrRelaxTable) &&
+ "InstrRelaxTable is not sorted and unique!");
+ RelaxTableChecked.store(true, std::memory_order_relaxed);
+ }
+#endif
+
+ const X86InstrRelaxTableEntry *Data = llvm::lower_bound(Table, ShortOp);
+ if (Data != Table.end() && Data->KeyOp == ShortOp)
+ return Data;
+ return nullptr;
+}
+
+const X86InstrRelaxTableEntry *llvm::lookupRelaxTable(unsigned ShortOp) {
+ return lookupRelaxTableImpl(InstrRelaxTable, ShortOp);
+}
+
+namespace {
+
+// This class stores the short form table. It is instantiated as a
+// ManagedStatic to lazily initialize the table on first use.
+struct X86ShortFormTable {
+ // Stores relaxation table entries sorted by relaxed form opcode.
+ SmallVector<X86InstrRelaxTableEntry, 0> Table;
+
+ X86ShortFormTable() {
+ for (const X86InstrRelaxTableEntry &Entry : InstrRelaxTable)
+ Table.push_back({Entry.DstOp, Entry.KeyOp});
+
+ llvm::sort(Table);
+
+ // Now that it's sorted, ensure it's unique.
+ assert(std::adjacent_find(Table.begin(), Table.end()) == Table.end() &&
+ "Short form table is not unique!");
+ }
+};
+} // namespace
+
+static ManagedStatic<X86ShortFormTable> ShortTable;
+
+const X86InstrRelaxTableEntry *llvm::lookupShortTable(unsigned RelaxOp) {
+ auto &Table = ShortTable->Table;
+ auto I = llvm::lower_bound(Table, RelaxOp);
+ if (I != Table.end() && I->KeyOp == RelaxOp)
+ return &*I;
+ return nullptr;
+}
+
+namespace llvm {
+
+/// Get the short instruction opcode for a given relaxed opcode.
+unsigned X86::getShortOpcodeArith(unsigned RelaxOp) {
+ if (const X86InstrRelaxTableEntry *I = lookupShortTable(RelaxOp))
+ return I->DstOp;
+ return RelaxOp;
+}
+
+/// Get the relaxed instruction opcode for a given short opcode.
+unsigned X86::getRelaxedOpcodeArith(unsigned ShortOp) {
+ if (const X86InstrRelaxTableEntry *I = lookupRelaxTable(ShortOp))
+ return I->DstOp;
+ return ShortOp;
+}
+} // namespace llvm
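
A usage sketch (an illustration added for this write-up, not code from the patch): a caller such as the assembler backend can normalize an arithmetic opcode to whichever immediate width it needs, since both helpers return their argument unchanged when the tables have no entry for it. Include paths are assumed relative to the X86 target directory.

  #include "X86InstrRelaxTables.h"
  #include <cstdint>

  // Pick the 8-bit-immediate form when the immediate fits, otherwise the
  // relaxed imm16/imm32 form. Opcodes without a table entry pass through.
  static unsigned pickArithOpcode(unsigned Opc, int64_t Imm) {
    if (Imm >= INT8_MIN && Imm <= INT8_MAX)
      return llvm::X86::getShortOpcodeArith(Opc);   // e.g. ADD64ri32 -> ADD64ri8
    return llvm::X86::getRelaxedOpcodeArith(Opc);   // e.g. ADD64ri8 -> ADD64ri32
  }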
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h
new file mode 100644
index 000000000000..0551c1861a58
--- /dev/null
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.h
@@ -0,0 +1,54 @@
+//===-- X86InstrRelaxTables.h - X86 Instruction Relaxation Tables -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the interface to query the X86 instruction relaxation
+// tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRRELAXTABLES_H
+#define LLVM_LIB_TARGET_X86_X86INSTRRELAXTABLES_H
+
+#include <cstdint>
+
+namespace llvm {
+
+// This struct is used for both the relaxed and short tables. The KeyOp is used
+// to determine the sorting order.
+struct X86InstrRelaxTableEntry {
+ uint16_t KeyOp;
+ uint16_t DstOp;
+
+ bool operator<(const X86InstrRelaxTableEntry &RHS) const {
+ return KeyOp < RHS.KeyOp;
+ }
+ bool operator==(const X86InstrRelaxTableEntry &RHS) const {
+ return KeyOp == RHS.KeyOp;
+ }
+ friend bool operator<(const X86InstrRelaxTableEntry &TE, unsigned Opcode) {
+ return TE.KeyOp < Opcode;
+ }
+};
+
+/// Look up the relaxed form table entry for a given \p ShortOp.
+const X86InstrRelaxTableEntry *lookupRelaxTable(unsigned ShortOp);
+
+/// Look up the short form table entry for a given \p RelaxOp.
+const X86InstrRelaxTableEntry *lookupShortTable(unsigned RelaxOp);
+
+namespace X86 {
+
+/// Get the short instruction opcode for a given relaxed opcode.
+unsigned getShortOpcodeArith(unsigned RelaxOp);
+
+/// Get the relaxed instruction opcode for a given short opcode.
+unsigned getRelaxedOpcodeArith(unsigned ShortOp);
+} // namespace X86
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index 48c335f9a777..2a2afa925a9c 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -40,11 +40,11 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
- printInstFlags(MI, OS);
+ printInstFlags(MI, OS, STI);
// In 16-bit mode, print data16 as data32.
if (MI->getOpcode() == X86::DATA16_PREFIX &&
- STI.getFeatureBits()[X86::Mode16Bit]) {
+ STI.getFeatureBits()[X86::Is16Bit]) {
OS << "\tdata32";
} else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
printInstruction(MI, Address, OS);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 4fa8bc64b245..a21bb6da86de 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -155,65 +156,6 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
return MCFixup::getKindForSize(Size, isPCRel);
}
-/// \param Op operand # of the memory operand.
-///
-/// \returns true if the specified instruction has a 16-bit memory operand.
-static bool is16BitMemOperand(const MCInst &MI, unsigned Op,
- const MCSubtargetInfo &STI) {
- const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
- const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
-
- unsigned BaseReg = Base.getReg();
- unsigned IndexReg = Index.getReg();
-
- if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0)
- return true;
- if ((BaseReg != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) ||
- (IndexReg != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)))
- return true;
- return false;
-}
-
-/// \param Op operand # of the memory operand.
-///
-/// \returns true if the specified instruction has a 32-bit memory operand.
-static bool is32BitMemOperand(const MCInst &MI, unsigned Op) {
- const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
-
- if ((BaseReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
- (IndexReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
- return true;
- if (BaseReg.getReg() == X86::EIP) {
- assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
- return true;
- }
- if (IndexReg.getReg() == X86::EIZ)
- return true;
- return false;
-}
-
-/// \param Op operand # of the memory operand.
-///
-/// \returns true if the specified instruction has a 64-bit memory operand.
-#ifndef NDEBUG
-static bool is64BitMemOperand(const MCInst &MI, unsigned Op) {
- const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
-
- if ((BaseReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
- (IndexReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
- return true;
- return false;
-}
-#endif
-
enum GlobalOffsetTableExprKind { GOT_None, GOT_Normal, GOT_SymDiff };
/// Check if this expression starts with _GLOBAL_OFFSET_TABLE_ and if it is
@@ -391,7 +333,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// Handle %rip relative addressing.
if (BaseReg == X86::RIP ||
BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
- assert(STI.hasFeature(X86::Mode64Bit) &&
+ assert(STI.hasFeature(X86::Is64Bit) &&
"Rip-relative addressing requires 64-bit mode");
assert(IndexReg.getReg() == 0 && !ForceSIB &&
"Invalid rip-relative address");
@@ -462,7 +404,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// 16-bit addressing forms of the ModR/M byte have a different encoding for
// the R/M field and are far more limited in which registers can be used.
- if (is16BitMemOperand(MI, Op, STI)) {
+ if (X86_MC::is16BitMemOperand(MI, Op, STI)) {
if (BaseReg) {
// For 32-bit addressing, the row and column values in Table 2-2 are
// basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
@@ -540,7 +482,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
BaseRegNo != N86::ESP &&
// If there is no base register and we're in 64-bit mode, we need a SIB
// byte to emit an addr that is just 'disp32' (the non-RIP relative form).
- (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) {
+ (!STI.hasFeature(X86::Is64Bit) || BaseReg != 0)) {
if (BaseReg == 0) { // [disp32] in X86-32 mode
emitByte(modRMByte(0, RegOpcodeField, 5), OS);
@@ -671,75 +613,29 @@ bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
emitByte(0xF2, OS);
// Emit the address size opcode prefix as needed.
- bool NeedAddressOverride;
- uint64_t AdSize = TSFlags & X86II::AdSizeMask;
- if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) ||
- (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) ||
- (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) {
- NeedAddressOverride = true;
- } else if (MemoryOperand < 0) {
- NeedAddressOverride = false;
- } else if (STI.hasFeature(X86::Mode64Bit)) {
- assert(!is16BitMemOperand(MI, MemoryOperand, STI));
- NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand);
- } else if (STI.hasFeature(X86::Mode32Bit)) {
- assert(!is64BitMemOperand(MI, MemoryOperand));
- NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI);
- } else {
- assert(STI.hasFeature(X86::Mode16Bit));
- assert(!is64BitMemOperand(MI, MemoryOperand));
- NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI);
- }
-
- if (NeedAddressOverride)
+ if (X86_MC::needsAddressSizeOverride(MI, STI, MemoryOperand, TSFlags) ||
+ Flags & X86::IP_HAS_AD_SIZE)
emitByte(0x67, OS);
- // Encoding type for this instruction.
- uint64_t Encoding = TSFlags & X86II::EncodingMask;
- bool HasREX = false;
- if (Encoding)
- emitVEXOpcodePrefix(MemoryOperand, MI, OS);
- else
- HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS);
-
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
default:
break;
case X86II::RawFrmDstSrc: {
- unsigned siReg = MI.getOperand(1).getReg();
- assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
- (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
- (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
- "SI and DI register sizes do not match");
// Emit segment override opcode prefix as needed (not for %ds).
if (MI.getOperand(2).getReg() != X86::DS)
emitSegmentOverridePrefix(2, MI, OS);
- // Emit AdSize prefix as needed.
- if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
- (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
- emitByte(0x67, OS);
CurOp += 3; // Consume operands.
break;
}
case X86II::RawFrmSrc: {
- unsigned siReg = MI.getOperand(0).getReg();
// Emit segment override opcode prefix as needed (not for %ds).
if (MI.getOperand(1).getReg() != X86::DS)
emitSegmentOverridePrefix(1, MI, OS);
- // Emit AdSize prefix as needed.
- if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
- (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
- emitByte(0x67, OS);
CurOp += 2; // Consume operands.
break;
}
case X86II::RawFrmDst: {
- unsigned siReg = MI.getOperand(0).getReg();
- // Emit AdSize prefix as needed.
- if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) ||
- (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI))
- emitByte(0x67, OS);
++CurOp; // Consume operand.
break;
}
@@ -750,6 +646,15 @@ bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
}
}
+ // The REX prefix is optional, but if used it must come immediately before the opcode.
+ // Encoding type for this instruction.
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ bool HasREX = false;
+ if (Encoding)
+ emitVEXOpcodePrefix(MemoryOperand, MI, OS);
+ else
+ HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS);
+
return HasREX;
}
@@ -1347,7 +1252,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
// Emit the operand size opcode prefix as needed.
if ((TSFlags & X86II::OpSizeMask) ==
- (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16))
+ (STI.hasFeature(X86::Is16Bit) ? X86II::OpSize32 : X86II::OpSize16))
emitByte(0x66, OS);
// Emit the LOCK opcode prefix.
@@ -1371,9 +1276,9 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
}
// Handle REX prefix.
- assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) &&
+ assert((STI.hasFeature(X86::Is64Bit) || !(TSFlags & X86II::REX_W)) &&
"REX.W requires 64bit mode.");
- bool HasREX = STI.hasFeature(X86::Mode64Bit)
+ bool HasREX = STI.hasFeature(X86::Is64Bit)
? emitREXPrefix(MemOperand, MI, STI, OS)
: false;
@@ -1472,7 +1377,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::RawFrm:
emitByte(BaseOpcode + OpcodeOffset, OS);
- if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII))
+ if (!STI.hasFeature(X86::Is64Bit) || !isPCRel32Branch(MI, MCII))
break;
const MCOperand &Op = MI.getOperand(CurOp++);
@@ -1842,7 +1747,6 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx) {
return new X86MCCodeEmitter(MCII, Ctx);
}
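
The reordering above keeps the REX byte adjacent to the opcode, which is the architectural requirement the new comment states: legacy prefixes (LOCK, segment overrides, 0x66, 0x67, REP) may appear in any order, but REX must immediately precede the opcode bytes (including any 0F escape). A worked encoding example from general x86-64 knowledge, not taken from this patch:

  lock addq %rax, (%r8)
    0xF0         LOCK            - legacy prefix, emitted by emitPrefixImpl
    0x49         REX.WB          - emitted last, directly before the opcode
    0x01 0x00    opcode + ModRM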
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
index 532fecd9951b..cd2baeb1c98e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h
@@ -18,6 +18,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
namespace llvm {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 8913e405539e..49660883ad83 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -72,6 +72,97 @@ bool X86_MC::hasLockPrefix(const MCInst &MI) {
return MI.getFlags() & X86::IP_HAS_LOCK;
}
+static bool isMemOperand(const MCInst &MI, unsigned Op, unsigned RegClassID) {
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+ const MCRegisterClass &RC = X86MCRegisterClasses[RegClassID];
+
+ return (Base.isReg() && Base.getReg() != 0 && RC.contains(Base.getReg())) ||
+ (Index.isReg() && Index.getReg() != 0 && RC.contains(Index.getReg()));
+}
+
+bool X86_MC::is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI) {
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+
+ if (STI.hasFeature(X86::Is16Bit) && Base.isReg() && Base.getReg() == 0 &&
+ Index.isReg() && Index.getReg() == 0)
+ return true;
+ return isMemOperand(MI, Op, X86::GR16RegClassID);
+}
+
+bool X86_MC::is32BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+ if (Base.isReg() && Base.getReg() == X86::EIP) {
+ assert(Index.isReg() && Index.getReg() == 0 && "Invalid eip-based address");
+ return true;
+ }
+ if (Index.isReg() && Index.getReg() == X86::EIZ)
+ return true;
+ return isMemOperand(MI, Op, X86::GR32RegClassID);
+}
+
+#ifndef NDEBUG
+bool X86_MC::is64BitMemOperand(const MCInst &MI, unsigned Op) {
+ return isMemOperand(MI, Op, X86::GR64RegClassID);
+}
+#endif
+
+bool X86_MC::needsAddressSizeOverride(const MCInst &MI,
+ const MCSubtargetInfo &STI,
+ int MemoryOperand, uint64_t TSFlags) {
+ uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+ bool Is16BitMode = STI.hasFeature(X86::Is16Bit);
+ bool Is32BitMode = STI.hasFeature(X86::Is32Bit);
+ bool Is64BitMode = STI.hasFeature(X86::Is64Bit);
+ if ((Is16BitMode && AdSize == X86II::AdSize32) ||
+ (Is32BitMode && AdSize == X86II::AdSize16) ||
+ (Is64BitMode && AdSize == X86II::AdSize32))
+ return true;
+ uint64_t Form = TSFlags & X86II::FormMask;
+ switch (Form) {
+ default:
+ break;
+ case X86II::RawFrmDstSrc: {
+ unsigned siReg = MI.getOperand(1).getReg();
+ assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+ (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+ (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+ "SI and DI register sizes do not match");
+ return (!Is32BitMode && siReg == X86::ESI) ||
+ (Is32BitMode && siReg == X86::SI);
+ }
+ case X86II::RawFrmSrc: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ return (!Is32BitMode && siReg == X86::ESI) ||
+ (Is32BitMode && siReg == X86::SI);
+ }
+ case X86II::RawFrmDst: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ return (!Is32BitMode && siReg == X86::EDI) ||
+ (Is32BitMode && siReg == X86::DI);
+ }
+ }
+
+ // Without a memory operand, none of the remaining checks apply.
+ if (MemoryOperand < 0)
+ return false;
+
+ if (STI.hasFeature(X86::Is64Bit)) {
+ assert(!is16BitMemOperand(MI, MemoryOperand, STI));
+ return is32BitMemOperand(MI, MemoryOperand);
+ }
+ if (STI.hasFeature(X86::Is32Bit)) {
+ assert(!is64BitMemOperand(MI, MemoryOperand));
+ return is16BitMemOperand(MI, MemoryOperand, STI);
+ }
+ assert(STI.hasFeature(X86::Is16Bit));
+ assert(!is64BitMemOperand(MI, MemoryOperand));
+ return !is16BitMemOperand(MI, MemoryOperand, STI);
+}
+
void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
// FIXME: TableGen these.
for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
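
The instruction printer and the code emitter shown earlier both funnel through this one predicate. A minimal sketch of the calling pattern (an illustration assuming the caller already has the instruction's MCInstrDesc; it mirrors the printer hunk above and is not code from the patch):

  #include "MCTargetDesc/X86BaseInfo.h"      // X86II::getMemoryOperandNo/getOperandBias
  #include "MCTargetDesc/X86MCTargetDesc.h"  // X86_MC::needsAddressSizeOverride
  #include "llvm/MC/MCInst.h"
  #include "llvm/MC/MCInstrDesc.h"
  #include "llvm/MC/MCSubtargetInfo.h"

  static bool wantsAddrSizePrefix(const llvm::MCInst &MI,
                                  const llvm::MCInstrDesc &Desc,
                                  const llvm::MCSubtargetInfo &STI) {
    uint64_t TSFlags = Desc.TSFlags;
    // Locate the memory operand, if any, the same way the printer does.
    int MemOp = llvm::X86II::getMemoryOperandNo(TSFlags);
    if (MemOp != -1)
      MemOp += llvm::X86II::getOperandBias(Desc);
    return llvm::X86_MC::needsAddressSizeOverride(MI, STI, MemOp, TSFlags);
  }

For example, movw (%bx), %ax assembled for a 32-bit subtarget has a GR16 base register, so is16BitMemOperand() is true and the 0x67 prefix is required.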
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 35604cd3ec0a..d0530bd4d650 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -63,6 +63,28 @@ void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
/// Returns true if this instruction has a LOCK prefix.
bool hasLockPrefix(const MCInst &MI);
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 16-bit memory operand.
+bool is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI);
+
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 32-bit memory operand.
+bool is32BitMemOperand(const MCInst &MI, unsigned Op);
+
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 64-bit memory operand.
+#ifndef NDEBUG
+bool is64BitMemOperand(const MCInst &MI, unsigned Op);
+#endif
+
+/// Returns true if this instruction needs an Address-Size override prefix.
+bool needsAddressSizeOverride(const MCInst &MI, const MCSubtargetInfo &STI,
+ int MemoryOperand, uint64_t TSFlags);
+
/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
/// do not need to go through TargetRegistry.
MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
@@ -70,7 +92,6 @@ MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
}
MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
MCContext &Ctx);
MCAsmBackend *createX86_32AsmBackend(const Target &T,
@@ -142,4 +163,7 @@ MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned,
#define GET_SUBTARGETINFO_ENUM
#include "X86GenSubtargetInfo.inc"
+#define GET_X86_MNEMONIC_TABLES_H
+#include "X86GenMnemonicTables.inc"
+
#endif
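
For reference, the three predicates classify a memory operand by the register class of its base and index registers (standard x86 addressing forms, listed here as an illustration rather than text from the patch):

  16-bit: (%bx,%si), 4(%bp,%di), (%si)   - GR16 base or index, or a bare
                                           displacement when in 16-bit mode
  32-bit: (%eax), 8(,%ecx,4), (%eip)     - GR32 base or index, an EIP base,
                                           or an EIZ index
  64-bit: (%rax), (,%r9,8)               - GR64 base or index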
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp
new file mode 100644
index 000000000000..39b7f0f4160e
--- /dev/null
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MnemonicTables.cpp
@@ -0,0 +1,16 @@
+//===-- X86MnemonicTables.cpp - X86 Mnemonic Tables -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 mnemonic tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+
+#define GET_X86_MNEMONIC_TABLES_CPP
+#include "X86GenMnemonicTables.inc"
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index c29211246123..36945d1f6746 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -9,6 +9,7 @@
#include "X86MCTargetDesc.h"
#include "X86TargetStreamer.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCWin64EH.h"
@@ -25,15 +26,15 @@ public:
std::unique_ptr<MCObjectWriter> OW)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
- void EmitWinEHHandlerData(SMLoc Loc) override;
- void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
- void EmitWindowsUnwindTables() override;
- void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
+ void emitWinEHHandlerData(SMLoc Loc) override;
+ void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
+ void emitWindowsUnwindTables() override;
+ void emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
void finishImpl() override;
};
-void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
- MCStreamer::EmitWinEHHandlerData(Loc);
+void X86WinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) {
+ MCStreamer::emitWinEHHandlerData(Loc);
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section.
@@ -41,17 +42,17 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true);
}
-void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
+void X86WinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
}
-void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
+void X86WinCOFFStreamer::emitWindowsUnwindTables() {
if (!getNumWinFrameInfos())
return;
EHStreamer.Emit(*this);
}
-void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
+void X86WinCOFFStreamer::emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
X86TargetStreamer *XTS =
static_cast<X86TargetStreamer *>(getTargetStreamer());
XTS->emitFPOData(ProcSym, Loc);
@@ -59,7 +60,7 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
void X86WinCOFFStreamer::finishImpl() {
emitFrames(nullptr);
- EmitWindowsUnwindTables();
+ emitWindowsUnwindTables();
MCWinCOFFStreamer::finishImpl();
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index bf3f4e990ecc..f2827c568109 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/FormattedStream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 10e1c5d6ed38..7344900f2e31 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -79,6 +79,9 @@ FunctionPass *createX86DynAllocaExpander();
/// Return a pass that config the tile registers.
FunctionPass *createX86TileConfigPass();
+/// Return a pass that preconfigures the tile registers before fast reg allocation.
+FunctionPass *createX86FastPreTileConfigPass();
+
/// Return a pass that config the tile registers after fast reg allocation.
FunctionPass *createX86FastTileConfigPass();
@@ -175,6 +178,7 @@ void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86FastPreTileConfigPass(PassRegistry &);
void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
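
The pass implementation and its pipeline hookup live in other files of this change; as a rough, hypothetical sketch of how a pre-RA tile-configuration pass is typically wired in (everything here other than createX86FastPreTileConfigPass and createX86FastTileConfigPass is an assumption, not taken from this diff):

  // Hypothetical sketch of the pipeline hookup in the target's pass config.
  void X86PassConfig::addFastRegAlloc() {
    // Pre-configure tile registers before the fast register allocator runs;
    // createX86FastTileConfigPass() still runs after fast regalloc.
    addPass(createX86FastPreTileConfigPass());
    TargetPassConfig::addFastRegAlloc();
  }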
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 380507308c3d..a5c6b40c493c 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -18,13 +18,13 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// X86 Subtarget state
//
-
-def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
- "64-bit mode (x86_64)">;
-def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
- "32-bit mode (80386)">;
-def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
- "16-bit mode (i8086)">;
+// disregarding specific ABI / programming model
+def Is64Bit : SubtargetFeature<"64bit-mode", "Is64Bit", "true",
+ "64-bit mode (x86_64)">;
+def Is32Bit : SubtargetFeature<"32bit-mode", "Is32Bit", "true",
+ "32-bit mode (80386)">;
+def Is16Bit : SubtargetFeature<"16bit-mode", "Is16Bit", "true",
+ "16-bit mode (i8086)">;
//===----------------------------------------------------------------------===//
// X86 Subtarget ISA features
@@ -34,16 +34,16 @@ def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
"Enable X87 float instructions">;
def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
- "Enable NOPL instruction">;
+ "Enable NOPL instruction (generally pentium pro+)">;
-def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMOV", "true",
"Enable conditional move instructions">;
-def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
- "Support CMPXCHG8B instructions">;
+def FeatureCX8 : SubtargetFeature<"cx8", "HasCX8", "true",
+ "Support CMPXCHG8B instructions">;
def FeatureCRC32 : SubtargetFeature<"crc32", "HasCRC32", "true",
- "Enable SSE 4.2 CRC32 instruction">;
+ "Enable SSE 4.2 CRC32 instruction (used when SSE4.2 is supported but function is GPR only)">;
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
@@ -98,11 +98,11 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
// without disabling 64-bit mode. Nothing should imply this feature bit. It
// is used to enforce that only 64-bit capable CPUs are used in 64-bit mode.
-def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+def FeatureX86_64 : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions">;
-def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
- "64-bit with cmpxchg16b",
- [FeatureCMPXCHG8B]>;
+def FeatureCX16 : SubtargetFeature<"cx16", "HasCX16", "true",
+ "64-bit with cmpxchg16b (this is true for most x86-64 chips, but not the first AMD chips)",
+ [FeatureCX8]>;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
[FeatureSSE3]>;
@@ -119,7 +119,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
"Support 16-bit floating point conversion instructions",
[FeatureAVX]>;
-def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
"Enable AVX-512 instructions",
[FeatureAVX2, FeatureFMA, FeatureF16C]>;
def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
@@ -198,7 +198,7 @@ def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
[FeatureFMA4]>;
def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
"HasSSEUnalignedMem", "true",
- "Allow unaligned memory operands with SSE instructions">;
+ "Allow unaligned memory operands with SSE instructions (this may require setting a configuration bit in the processor)">;
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
"Enable AES instructions",
[FeatureSSE2]>;
@@ -228,20 +228,22 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
"Enable SHA instructions",
[FeatureSSE2]>;
+// Processor supports CET SHSTK - Control-Flow Enforcement Technology
+// using Shadow Stack
def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
"Support CET Shadow-Stack instructions">;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
-def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true",
+def FeatureLAHFSAHF64 : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true",
"Support LAHF and SAHF instructions in 64-bit mode">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
"Enable Cache Line Zero">;
def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
- "Enable Cache Demote">;
+ "Enable Cache Line Demote">;
def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
"Support ptwrite instruction">;
def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true",
@@ -285,9 +287,9 @@ def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true",
def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
"platform configuration instruction">;
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
- "Support movdiri instruction">;
+ "Support movdiri instruction (direct store integer)">;
def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
- "Support movdir64b instruction">;
+ "Support movdir64b instruction (direct store 64 bytes)">;
// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
// "string operations"). See "REP String Enhancement" in the Intel Software
@@ -380,6 +382,17 @@ def FeatureTaggedGlobals
"Use an instruction sequence for taking the address of a global "
"that allows a memory tag in the upper address bits.">;
+// Control codegen mitigation against Straight Line Speculation vulnerability.
+def FeatureHardenSlsRet
+ : SubtargetFeature<
+ "harden-sls-ret", "HardenSlsRet", "true",
+ "Harden against straight line speculation across RET instructions.">;
+
+def FeatureHardenSlsIJmp
+ : SubtargetFeature<
+ "harden-sls-ijmp", "HardenSlsIJmp", "true",
+ "Harden against straight line speculation across indirect JMP instructions.">;
+
//===----------------------------------------------------------------------===//
// X86 Subtarget Tuning features
//===----------------------------------------------------------------------===//
@@ -388,7 +401,7 @@ def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
- "PMULLD instruction is slow">;
+ "PMULLD instruction is slow (compared to PMULLW/PMULHW and PMULUDQ)">;
def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
"true",
@@ -396,27 +409,31 @@ def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
// FIXME: This should not apply to CPUs that do not have SSE.
def TuningSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
- "IsUAMem16Slow", "true",
+ "IsUnalignedMem16Slow", "true",
"Slow unaligned 16-byte memory access">;
def TuningSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
- "IsUAMem32Slow", "true",
+ "IsUnalignedMem32Slow", "true",
"Slow unaligned 32-byte memory access">;
def TuningLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
- "Use LEA for adjusting the stack pointer">;
+ "Use LEA for adjusting the stack pointer (this is an optimization for Intel Atom processors)">;
+// True if 8-bit divisions are significantly faster than
+// 32-bit divisions and should be used when possible.
def TuningSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
+// True if 32-bit divides are significantly faster than
+// 64-bit divisions and should be used when possible.
def TuningSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
"Use 32-bit divide for positive values less than 2^32">;
def TuningPadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
- "Pad short functions">;
+ "Pad short functions (to prevent a stall when returning too early)">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
@@ -425,15 +442,21 @@ def TuningSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
"SlowTwoMemOps", "true",
"Two memory operand instructions are slow">;
-def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+// True if the LEA instruction inputs have to be ready at address generation
+// (AG) time.
+def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LeaUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def TuningSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+// True if the LEA instruction has all three source operands: base, index,
+// and offset, or if the LEA instruction uses base and index registers where
+// the base is EBP, RBP, or R13.
def TuningSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
"LEA instruction with 3 ops or certain registers is slow">;
+// True if INC and DEC instructions are slow when writing to flags
def TuningSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
@@ -445,6 +468,31 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
+def TuningMULCFalseDeps : SubtargetFeature<"false-deps-mulc",
+ "HasMULCFalseDeps", "true",
+ "VF[C]MULCPH/SH has a false dependency on dest register">;
+
+def TuningPERMFalseDeps : SubtargetFeature<"false-deps-perm",
+ "HasPERMFalseDeps", "true",
+ "VPERMD/Q/PS/PD has a false dependency on dest register">;
+
+def TuningRANGEFalseDeps : SubtargetFeature<"false-deps-range",
+ "HasRANGEFalseDeps", "true",
+ "VRANGEPD/PS/SD/SS has a false dependency on dest register">;
+
+def TuningGETMANTFalseDeps : SubtargetFeature<"false-deps-getmant",
+ "HasGETMANTFalseDeps", "true",
+ "VGETMANTSS/SD/SH and VGETMANDPS/PD(memory version) has a"
+ " false dependency on dest register">;
+
+def TuningMULLQFalseDeps : SubtargetFeature<"false-deps-mullq",
+ "HasMULLQFalseDeps", "true",
+ "VPMULLQ has a false dependency on dest register">;
+
+def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking",
+ "HasSBBDepBreaking", "true",
+ "SBB with same register has no source dependency">;
+
// On recent X86 (port bound) processors, its preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def TuningFastVariableCrossLaneShuffle
@@ -470,9 +518,14 @@ def TuningInsertVZEROUPPER
// vectorized code we should care about the throughput of SQRT operations.
// But if the code is scalar that probably means that the code has some kind of
// dependency and we should care more about reducing the latency.
+
+// True if hardware SQRTSS instruction is at least as fast (latency) as
+// RSQRTSS followed by a Newton-Raphson iteration.
def TuningFastScalarFSQRT
: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
def TuningFastVectorFSQRT
: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
"true", "Vector SQRT is fast (disable Newton-Raphson)">;
@@ -529,7 +582,7 @@ def TuningMacroFusion
// similar to Skylake Server (AVX-512).
def TuningFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
- "Indicates if gather is reasonably fast">;
+ "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
def TuningPrefer128Bit
: SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
@@ -578,17 +631,13 @@ def TuningUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
-// Enable use of alias analysis during code generation.
-def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
- "Use alias analysis during codegen">;
-
//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
//===----------------------------------------------------------------------===//
// Bonnell
-def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
+def ProcIntelAtom : SubtargetFeature<"", "IsAtom", "true", "Is Intel Atom processor">;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -632,11 +681,11 @@ include "X86SchedIceLake.td"
def ProcessorFeatures {
// x86-64 and x86-64-v[234]
list<SubtargetFeature> X86_64V1Features = [
- FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, Feature64Bit
+ FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureX86_64,
];
list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
- FeatureCMPXCHG16B, FeatureLAHFSAHF, FeatureCRC32, FeaturePOPCNT,
+ FeatureCX16, FeatureLAHFSAHF64, FeatureCRC32, FeaturePOPCNT,
FeatureSSE42
]);
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -862,22 +911,27 @@ def ProcessorFeatures {
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureUINTR];
- list<SubtargetFeature> SPRTuning = ICXTuning;
+ list<SubtargetFeature> SPRAdditionalTuning = [TuningMULCFalseDeps,
+ TuningPERMFalseDeps,
+ TuningRANGEFalseDeps,
+ TuningGETMANTFalseDeps,
+ TuningMULLQFalseDeps];
+ list<SubtargetFeature> SPRTuning = !listconcat(ICXTuning, SPRAdditionalTuning);
list<SubtargetFeature> SPRFeatures =
!listconcat(ICXFeatures, SPRAdditionalFeatures);
// Atom
list<SubtargetFeature> AtomFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
+ FeatureX86_64,
+ FeatureCX16,
FeatureMOVBE,
- FeatureLAHFSAHF];
+ FeatureLAHFSAHF64];
list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
TuningSlowUAMem16,
TuningLEAForSP,
@@ -968,25 +1022,26 @@ def ProcessorFeatures {
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureWAITPKG];
- list<SubtargetFeature> ADLTuning = SKLTuning;
+ list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps];
+ list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
list<SubtargetFeature> ADLFeatures =
!listconcat(TRMFeatures, ADLAdditionalFeatures);
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureMMX,
FeatureFXSR,
FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
+ FeatureX86_64,
+ FeatureCX16,
FeatureCRC32,
FeaturePOPCNT,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF,
+ FeatureLAHFSAHF64,
FeatureAES,
FeatureRDRAND,
FeatureF16C,
@@ -1018,41 +1073,43 @@ def ProcessorFeatures {
// Barcelona
list<SubtargetFeature> BarcelonaFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureSSE4A,
Feature3DNowA,
FeatureFXSR,
FeatureNOPL,
- FeatureCMPXCHG16B,
+ FeatureCX16,
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
- FeatureLAHFSAHF,
+ FeatureLAHFSAHF64,
FeatureCMOV,
- Feature64Bit];
+ FeatureX86_64];
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
TuningSlowSHLD,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
// Bobcat
list<SubtargetFeature> BtVer1Features = [FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureSSE4A,
FeatureFXSR,
FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
+ FeatureX86_64,
+ FeatureCX16,
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
- FeatureLAHFSAHF];
+ FeatureLAHFSAHF64];
list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningSlowSHLD,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
// Jaguar
@@ -1072,17 +1129,18 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningFastMOVBE,
+ TuningSBBDepBreaking,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
// Bulldozer
list<SubtargetFeature> BdVer1Features = [FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureXOP,
- Feature64Bit,
- FeatureCMPXCHG16B,
+ FeatureX86_64,
+ FeatureCX16,
FeatureAES,
FeatureCRC32,
FeaturePRFCHW,
@@ -1094,11 +1152,12 @@ def ProcessorFeatures {
FeaturePOPCNT,
FeatureXSAVE,
FeatureLWP,
- FeatureLAHFSAHF];
+ FeatureLAHFSAHF64];
list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
TuningFast11ByteNOP,
TuningFastScalarShiftMasks,
TuningBranchFusion,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
// PileDriver
@@ -1140,15 +1199,15 @@ def ProcessorFeatures {
FeatureCLFLUSHOPT,
FeatureCLZERO,
FeatureCMOV,
- Feature64Bit,
- FeatureCMPXCHG16B,
+ FeatureX86_64,
+ FeatureCX16,
FeatureCRC32,
FeatureF16C,
FeatureFMA,
FeatureFSGSBase,
FeatureFXSR,
FeatureNOPL,
- FeatureLAHFSAHF,
+ FeatureLAHFSAHF64,
FeatureLZCNT,
FeatureMMX,
FeatureMOVBE,
@@ -1169,9 +1228,13 @@ def ProcessorFeatures {
TuningFastBEXTR,
TuningFast15ByteNOP,
TuningBranchFusion,
+ TuningFastScalarFSQRT,
+ TuningFastVectorFSQRT,
TuningFastScalarShiftMasks,
+ TuningFastVariablePerLaneShuffle,
TuningFastMOVBE,
TuningSlowSHLD,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
@@ -1184,11 +1247,9 @@ def ProcessorFeatures {
FeaturePKU,
FeatureVAES,
FeatureVPCLMULQDQ];
- list<SubtargetFeature> ZN3AdditionalTuning =
- [TuningMacroFusion,
- TuningFastVariablePerLaneShuffle];
+ list<SubtargetFeature> ZN3AdditionalTuning = [TuningMacroFusion];
list<SubtargetFeature> ZN3Tuning =
- !listconcat(ZNTuning, ZN3AdditionalTuning);
+ !listconcat(ZN2Tuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
!listconcat(ZN2Features, ZN3AdditionalFeatures);
}
@@ -1209,39 +1270,43 @@ class ProcModel<string Name, SchedMachineModel Model,
// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget
-// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled.
-// It has no effect on code generation.
+// constructor checks that any CPU used in 64-bit mode has FeatureX86_64
+// enabled. It has no effect on code generation.
+// NOTE: As a default tuning, "generic" aims to produce code optimized for the
+// most common X86 processors. The tunings might be changed over time. It is
+// recommended to use "x86-64" in lit tests for consistency.
def : ProcModel<"generic", SandyBridgeModel,
- [FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
+ [FeatureX87, FeatureCX8, FeatureX86_64],
[TuningSlow3OpsLEA,
TuningSlowDivide64,
- TuningSlowIncDec,
TuningMacroFusion,
+ TuningFastScalarFSQRT,
+ TuningFast15ByteNOP,
TuningInsertVZEROUPPER]>;
def : Proc<"i386", [FeatureX87],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i486", [FeatureX87],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
+def : Proc<"i586", [FeatureX87, FeatureCX8],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
+def : Proc<"pentium", [FeatureX87, FeatureCX8],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+def : Proc<"pentium-mmx", [FeatureX87, FeatureCX8, FeatureMMX],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
+def : Proc<"i686", [FeatureX87, FeatureCX8, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+def : Proc<"pentiumpro", [FeatureX87, FeatureCX8, FeatureCMOV,
FeatureNOPL],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
+def : Proc<"pentium2", [FeatureX87, FeatureCX8, FeatureMMX, FeatureCMOV,
FeatureFXSR, FeatureNOPL],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ def : Proc<P, [FeatureX87, FeatureCX8, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
@@ -1257,42 +1322,42 @@ foreach P = ["pentium3", "pentium3m"] in {
// changes slightly.
def : ProcModel<"pentium-m", GenericPostRAModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcModel<P, GenericPostRAModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
// Intel Quark.
-def : Proc<"lakemont", [FeatureCMPXCHG8B],
+def : Proc<"lakemont", [FeatureCX8],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcModel<"yonah", SandyBridgeModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// NetBurst.
def : ProcModel<"prescott", GenericPostRAModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : ProcModel<"nocona", GenericPostRAModel, [
FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureMMX,
FeatureSSE3,
FeatureFXSR,
FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
+ FeatureX86_64,
+ FeatureCX16,
],
[
TuningSlowUAMem16,
@@ -1302,15 +1367,15 @@ def : ProcModel<"nocona", GenericPostRAModel, [
// Intel Core 2 Solo/Duo.
def : ProcModel<"core2", SandyBridgeModel, [
FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureLAHFSAHF
+ FeatureX86_64,
+ FeatureCX16,
+ FeatureLAHFSAHF64
],
[
TuningMacroFusion,
@@ -1319,15 +1384,15 @@ def : ProcModel<"core2", SandyBridgeModel, [
]>;
def : ProcModel<"penryn", SandyBridgeModel, [
FeatureX87,
- FeatureCMPXCHG8B,
+ FeatureCX8,
FeatureCMOV,
FeatureMMX,
FeatureSSE41,
FeatureFXSR,
FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureLAHFSAHF
+ FeatureX86_64,
+ FeatureCX16,
+ FeatureLAHFSAHF64
],
[
TuningMacroFusion,
@@ -1416,38 +1481,38 @@ def : ProcModel<"alderlake", SkylakeClientModel,
// AMD CPUs.
-def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+def : Proc<"k6", [FeatureX87, FeatureCX8, FeatureMMX],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+def : Proc<"k6-2", [FeatureX87, FeatureCX8, Feature3DNow],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+def : Proc<"k6-3", [FeatureX87, FeatureCX8, Feature3DNow],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
+ def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV, Feature3DNowA,
FeatureNOPL],
[TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
[TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
+ def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureX86_64, FeatureCMOV],
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
- TuningInsertVZEROUPPER]>;
+ TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
- Feature64Bit],
+ def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureCX16, FeatureCMOV,
+ FeatureX86_64],
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
- TuningInsertVZEROUPPER]>;
+ TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
@@ -1482,7 +1547,7 @@ def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
ProcessorFeatures.ZN3Tuning>;
-def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
+def : Proc<"geode", [FeatureX87, FeatureCX8, Feature3DNowA],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
@@ -1491,7 +1556,7 @@ def : Proc<"winchip2", [FeatureX87, Feature3DNow],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"c3", [FeatureX87, Feature3DNow],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+def : Proc<"c3-2", [FeatureX87, FeatureCX8, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureCMOV],
[TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index d48b8e458219..c205395aa084 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -29,6 +29,7 @@
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -60,8 +61,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SMShadowTracker.startFunction(MF);
CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
- *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(),
- MF.getContext()));
+ *Subtarget->getInstrInfo(), MF.getContext()));
EmitFPOData =
Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag();
@@ -70,12 +70,12 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (Subtarget->isTargetCOFF()) {
bool Local = MF.getFunction().hasLocalLinkage();
- OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
- OutStreamer->EmitCOFFSymbolStorageClass(
+ OutStreamer->beginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer->emitCOFFSymbolStorageClass(
Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL);
- OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
- << COFF::SCT_COMPLEX_TYPE_SHIFT);
- OutStreamer->EndCOFFSymbolDef();
+ OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ OutStreamer->endCOFFSymbolDef();
}
// Emit the rest of the function body.
@@ -249,7 +249,7 @@ void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo,
void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O, const char *Modifier) {
const MachineOperand &MO = MI->getOperand(OpNo);
- if (!Modifier || MO.getType() != MachineOperand::MO_Register)
+ if (!Modifier || !MO.isReg())
return PrintOperand(MI, OpNo, O);
if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
O << '%';
@@ -336,6 +336,37 @@ void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo,
}
}
+static bool isSimpleReturn(const MachineInstr &MI) {
+ // We exclude all tail calls here, which set both isReturn and isCall.
+ return MI.getDesc().isReturn() && !MI.getDesc().isCall();
+}
+
+static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return MI.getDesc().isIndirectBranch() /* keep the opcode checks below aligned */ ||
+ Opc == X86::TAILJMPr || Opc == X86::TAILJMPm ||
+ Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
+ Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
+ Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
+ Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX;
+}
+
+void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
+ if (Subtarget->hardenSlsRet() || Subtarget->hardenSlsIJmp()) {
+ auto I = MBB.getLastNonDebugInstr();
+ if (I != MBB.end()) {
+ if ((Subtarget->hardenSlsRet() && isSimpleReturn(*I)) ||
+ (Subtarget->hardenSlsIJmp() && isIndirectBranchOrTailCall(*I))) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::INT3);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ }
+ }
+ }
+ AsmPrinter::emitBasicBlockEnd(MBB);
+ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+}
+
void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O, const char *Modifier) {
assert(isMem(*MI, OpNo) && "Invalid memory reference!");
@@ -363,6 +394,12 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
BaseReg.getReg() == X86::RIP)
HasBaseReg = false;
+ // If we only want to print out the displacement.
+ if (Modifier && (DispSpec.isGlobal() || DispSpec.isSymbol()) &&
+ !strcmp(Modifier, "disp-only")) {
+ HasBaseReg = false;
+ }
+
// If this has a segment register, print it.
if (SegReg.getReg()) {
PrintOperand(MI, OpNo + X86::AddrSegmentReg, O);
@@ -606,11 +643,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
PrintMemReference(MI, OpNo, O, "H");
}
return false;
- case 'P': // Don't print @PLT, but do print as memory.
+ // Print memory only with displacement. The modifier 'P' is used in inline
+ // asm to reference a call symbol or a global symbol which cannot use a
+ // base register or an index register.
+ case 'P':
if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
- PrintIntelMemReference(MI, OpNo, O, "no-rip");
+ PrintIntelMemReference(MI, OpNo, O, "disp-only");
} else {
- PrintMemReference(MI, OpNo, O, "no-rip");
+ PrintMemReference(MI, OpNo, O, "disp-only");
}
return false;
}
@@ -641,7 +681,7 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
MCSection *Cur = OutStreamer->getCurrentSectionOnly();
MCSection *Nt = MMI->getContext().getELFSection(
".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
- OutStreamer->SwitchSection(Nt);
+ OutStreamer->switchSection(Nt);
// Emitting note header.
const int WordSize = TT.isArch64Bit() && !TT.isX32() ? 8 : 4;
@@ -658,21 +698,21 @@ void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
OutStreamer->endSection(Nt);
- OutStreamer->SwitchSection(Cur);
+ OutStreamer->switchSection(Cur);
}
}
if (TT.isOSBinFormatMachO())
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ OutStreamer->switchSection(getObjFileLowering().getTextSection());
if (TT.isOSBinFormatCOFF()) {
// Emit an absolute @feat.00 symbol. This appears to be some kind of
// compiler features bitfield read by link.exe.
MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
- OutStreamer->BeginCOFFSymbolDef(S);
- OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
- OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
- OutStreamer->EndCOFFSymbolDef();
+ OutStreamer->beginCOFFSymbolDef(S);
+ OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+ OutStreamer->endCOFFSymbolDef();
int64_t Feat00Flags = 0;
if (TT.getArch() == Triple::x86) {
@@ -739,7 +779,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
// Output stubs for external and common global variables.
Stubs = MMIMacho.GetGVStubList();
if (!Stubs.empty()) {
- OutStreamer.SwitchSection(MMI->getContext().getMachOSection(
+ OutStreamer.switchSection(MMI->getContext().getMachOSection(
"__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
SectionKind::getMetadata()));
@@ -747,7 +787,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
Stubs.clear();
- OutStreamer.AddBlankLine();
+ OutStreamer.addBlankLine();
}
}
@@ -795,6 +835,22 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
emitStackMaps(SM);
FM.serializeToFaultMapSection();
}
+
+ // Emit __morestack address if needed for indirect calls.
+ if (TT.getArch() == Triple::x86_64 && TM.getCodeModel() == CodeModel::Large) {
+ if (MCSymbol *AddrSymbol = OutContext.lookupSymbol("__morestack_addr")) {
+ Align Alignment(1);
+ MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant(
+ getDataLayout(), SectionKind::getReadOnly(),
+ /*C=*/nullptr, Alignment);
+ OutStreamer->switchSection(ReadOnlySection);
+ OutStreamer->emitLabel(AddrSymbol);
+
+ unsigned PtrSize = MAI->getCodePointerSize();
+ OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("__morestack"),
+ PtrSize);
+ }
+ }
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index 94679e6e3d11..d53c26b729ef 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -131,10 +131,7 @@ public:
void emitInstruction(const MachineInstr *MI) override;
- void emitBasicBlockEnd(const MachineBasicBlock &MBB) override {
- AsmPrinter::emitBasicBlockEnd(MBB);
- SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
- }
+ void emitBasicBlockEnd(const MachineBasicBlock &MBB) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
index 0899783d5f60..2ecf49382d29 100644
--- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
+++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -35,6 +35,7 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#define AVOIDCALL_DESC "X86 avoid trailing call pass"
@@ -69,8 +70,8 @@ INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false,
// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
// expand to nothing, and some expand to code. This logic conservatively assumes
// they might expand to nothing.
-static bool isRealInstruction(MachineInstr &MI) {
- return !MI.isPseudo() && !MI.isMetaInstruction();
+static bool isCallOrRealInstruction(MachineInstr &MI) {
+ return MI.isCall() || (!MI.isPseudo() && !MI.isMetaInstruction());
}
// Return true if this is a call instruction, but not a tail call.
@@ -100,7 +101,7 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
continue;
// Find the last real instruction in this block.
- auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction);
+ auto LastRealInstr = llvm::find_if(reverse(MBB), isCallOrRealInstruction);
// If the block is empty or the last real instruction is a call instruction,
// insert an int3. If there is a call instruction, insert the int3 between
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index c80a5d5bb332..ded93fdc011c 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -299,7 +299,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
const MachineFunction &MF = State.getMachineFunction();
size_t ArgCount = State.getMachineFunction().getFunction().arg_size();
- bool Is64Bit = static_cast<const X86Subtarget &>(MF.getSubtarget()).is64Bit();
+ bool Is64Bit = MF.getSubtarget<X86Subtarget>().is64Bit();
unsigned SlotSize = Is64Bit ? 8 : 4;
unsigned Offset;
if (ArgCount == 1 && ValNo == 0) {
diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp
index 96d3d1390a59..f32891552a82 100644
--- a/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -97,6 +97,11 @@ static cl::opt<bool> ForceMemOperand(
cl::desc("Convert cmovs to branches whenever they have memory operands."),
cl::init(true), cl::Hidden);
+static cl::opt<bool> ForceAll(
+ "x86-cmov-converter-force-all",
+ cl::desc("Convert all cmovs to branches."),
+ cl::init(false), cl::Hidden);
+
namespace {
/// Converts X86 cmov instructions into branches when profitable.
@@ -174,11 +179,11 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
TSchedModel.init(&STI);
// Before we handle the more subtle cases of register-register CMOVs inside
- // of potentially hot loops, we want to quickly remove all CMOVs with
- // a memory operand. The CMOV will risk a stall waiting for the load to
- // complete that speculative execution behind a branch is better suited to
- // handle on modern x86 chips.
- if (ForceMemOperand) {
+ // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
+ // the ones with a memory operand (ForceMemOperand option). The latter kind
+ // of CMOV risks a stall while waiting for the load to complete, a case that
+ // speculative execution behind a branch handles better on modern x86 chips.
+ if (ForceMemOperand || ForceAll) {
CmovGroups AllCmovGroups;
SmallVector<MachineBasicBlock *, 4> Blocks;
for (auto &MBB : MF)
@@ -186,7 +191,8 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
for (auto &Group : AllCmovGroups) {
// Skip any group that doesn't do at least one memory operand cmov.
- if (llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+ if (ForceMemOperand && !ForceAll &&
+ llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
continue;
// For CMOV groups which we can rewrite and which contain a memory load,
@@ -196,12 +202,15 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
convertCmovInstsToBranches(Group);
}
}
+ // Early return as ForceAll converts all CmovGroups.
+ if (ForceAll)
+ return Changed;
}
//===--------------------------------------------------------------------===//
// Register-operand Conversion Algorithm
// ---------
- // For each inner most loop
+ // For each innermost loop
// collectCmovCandidates() {
// Find all CMOV-group-candidates.
// }
@@ -230,7 +239,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
Loops.push_back(Child);
for (MachineLoop *CurrLoop : Loops) {
- // Optimize only inner most loops.
+ // Optimize only innermost loops.
if (!CurrLoop->getSubLoops().empty())
continue;
@@ -520,7 +529,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
//===--------------------------------------------------------------------===//
// Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
// Worth-Optimize-Group:
- // Iff it worths to optimize all CMOV instructions in the group.
+ // Iff it is worth optimizing all CMOV instructions in the group.
//
// Worth-Optimize-CMOV:
// Predicted branch is faster than CMOV by the difference between depth of
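For context, a hypothetical source-level pattern (not taken from the patch) of the kind the memory-operand conversion above targets; whether a CMOV is emitted at all depends on the target and the options shown above:

// A select whose chosen value comes from a load. Lowered as a CMOV, the load
// must complete before the CMOV can retire; lowered as a branch, the load is
// only needed on the taken path and can be handled by speculative execution.
int selectFromMemory(bool Cond, int Fallback, const int *P) {
  return Cond ? *P : Fallback;
}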
diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
index 2ff8ee19561b..29668f4b2761 100644
--- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
+++ b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -16,6 +16,7 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/ProfileData/SampleProf.h"
@@ -159,7 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
}
// Since we were able to encode, bump the MemOpDiscriminators.
++MemOpDiscriminators[L];
- DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue());
+ DI = DI->cloneWithDiscriminator(*EncodedDiscriminator);
assert(DI && "DI should not be nullptr");
updateDebugInfo(&MI, DI);
Changed = true;
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 9826bf4bf861..9d4338deca35 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -15,6 +15,7 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/STLExtras.h"
@@ -86,7 +87,7 @@ protected:
public:
InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {}
- virtual ~InstrConverterBase() {}
+ virtual ~InstrConverterBase() = default;
/// \returns true if \p MI is legal to convert.
virtual bool isLegal(const MachineInstr *MI,
@@ -374,7 +375,7 @@ class X86DomainReassignment : public MachineFunctionPass {
const X86InstrInfo *TII = nullptr;
/// All edges that are included in some closure
- DenseSet<unsigned> EnclosedEdges;
+ BitVector EnclosedEdges{8, false};
/// All instructions that are included in some closure.
DenseMap<MachineInstr *, unsigned> EnclosedInstrs;
@@ -429,10 +430,10 @@ char X86DomainReassignment::ID = 0;
void X86DomainReassignment::visitRegister(Closure &C, Register Reg,
RegDomain &Domain,
SmallVectorImpl<unsigned> &Worklist) {
- if (EnclosedEdges.count(Reg))
+ if (!Reg.isVirtual())
return;
- if (!Reg.isVirtual())
+ if (EnclosedEdges.test(Register::virtReg2Index(Reg)))
return;
if (!MRI->hasOneDef(Reg))
@@ -550,7 +551,7 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) {
// Register already in this closure.
if (!C.insertEdge(CurReg))
continue;
- EnclosedEdges.insert(Reg);
+ EnclosedEdges.set(Register::virtReg2Index(Reg));
MachineInstr *DefMI = MRI->getVRegDef(CurReg);
encloseInstr(C, DefMI);
@@ -742,6 +743,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
EnclosedEdges.clear();
+ EnclosedEdges.resize(MRI->getNumVirtRegs());
EnclosedInstrs.clear();
std::vector<Closure> Closures;
@@ -756,7 +758,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
continue;
// Register already in closure.
- if (EnclosedEdges.count(Reg))
+ if (EnclosedEdges.test(Idx))
continue;
// Calculate closure starting with Reg.
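As a consolidated view of the change above, a minimal standalone sketch (helper names are assumed, this is not the pass itself) of the BitVector-keyed-by-virtual-register-index idiom that replaces the DenseSet:

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Virtual registers have dense indices, so a BitVector sized to the number of
// virtual registers can stand in for a DenseSet<unsigned> of registers.
struct EnclosedEdgeSet {
  BitVector Bits;
  void init(const MachineRegisterInfo &MRI) {
    Bits.clear();
    Bits.resize(MRI.getNumVirtRegs());
  }
  void insert(Register Reg) {
    if (Reg.isVirtual())
      Bits.set(Register::virtReg2Index(Reg));
  }
  bool contains(Register Reg) const {
    return Reg.isVirtual() && Bits.test(Register::virtReg2Index(Reg));
  }
};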
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 6a047838f0b5..aebeec5a6d27 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -19,6 +19,7 @@
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
@@ -552,7 +553,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::PTILELOADDV:
case X86::PTILELOADDT1V: {
for (unsigned i = 2; i > 0; --i)
- MI.RemoveOperand(i);
+ MI.removeOperand(i);
unsigned Opc =
Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1;
MI.setDesc(TII->get(Opc));
@@ -565,7 +566,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::PTDPBF16PSV: {
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
- MI.RemoveOperand(i);
+ MI.removeOperand(i);
unsigned Opc;
switch (Opcode) {
case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break;
@@ -581,13 +582,13 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
case X86::PTILESTOREDV: {
for (int i = 1; i >= 0; --i)
- MI.RemoveOperand(i);
+ MI.removeOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));
return true;
}
case X86::PTILEZEROV: {
for (int i = 2; i > 0; --i) // Remove row, col
- MI.RemoveOperand(i);
+ MI.removeOperand(i);
MI.setDesc(TII->get(X86::TILEZERO));
return true;
}
@@ -729,7 +730,7 @@ bool X86ExpandPseudo::ExpandPseudosWhichAffectControlFlow(MachineFunction &MF) {
}
bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
- STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
+ STI = &MF.getSubtarget<X86Subtarget>();
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
X86FI = MF.getInfo<X86MachineFunctionInfo>();
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 1ac998b7ff7e..f2c362eeaa48 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -49,22 +49,11 @@ class X86FastISel final : public FastISel {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
- /// floating point ops.
- /// When SSE is available, use it for f32 operations.
- /// When SSE2 is available, use it for f64 operations.
- bool X86ScalarSSEf64;
- bool X86ScalarSSEf32;
- bool X86ScalarSSEf16;
-
public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo)
: FastISel(funcInfo, libInfo) {
Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
- X86ScalarSSEf64 = Subtarget->hasSSE2();
- X86ScalarSSEf32 = Subtarget->hasSSE1();
- X86ScalarSSEf16 = Subtarget->hasFP16();
}
bool fastSelectInstruction(const Instruction *I) override;
@@ -158,9 +147,8 @@ private:
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
- return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
- (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
- (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
+ return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
+ (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
}
bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@@ -292,6 +280,11 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
return false;
+ // Make sure there are no potentially EFLAGS-clobbering constant
+ // materializations in between.
+ if (llvm::any_of(I->operands(), [](Value *V) { return isa<Constant>(V); }))
+ return false;
+
CC = TmpCC;
return true;
}
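For illustration, a hypothetical source-level example (names not from the patch) of the pattern foldX86XALUIntrinsic folds: the overflow bit feeds the branch directly, so FastISel can branch on EFLAGS as long as nothing that clobbers the flags, such as an xor used to materialize a constant, is emitted in between:

// Compiled with clang, this lowers to llvm.sadd.with.overflow; the overflow
// bit can be folded into a conditional jump on the OF flag.
bool addWouldOverflow(int A, int B, int *Out) {
  return __builtin_add_overflow(A, B, Out);
}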
@@ -305,9 +298,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
VT = evt.getSimpleVT();
// For now, require SSE/SSE2 for performing floating-point operations,
// since x87 requires additional work.
- if (VT == MVT::f64 && !X86ScalarSSEf64)
+ if (VT == MVT::f64 && !Subtarget->hasSSE2())
return false;
- if (VT == MVT::f32 && !X86ScalarSSEf32)
+ if (VT == MVT::f32 && !Subtarget->hasSSE1())
return false;
// Similarly, no f80 support yet.
if (VT == MVT::f80)
@@ -325,6 +318,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
+ bool HasSSE1 = Subtarget->hasSSE1();
+ bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE41 = Subtarget->hasSSE41();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
@@ -354,20 +349,16 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
Opc = X86::MOV64rm;
break;
case MVT::f32:
- if (X86ScalarSSEf32)
- Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
- HasAVX ? X86::VMOVSSrm_alt :
- X86::MOVSSrm_alt;
- else
- Opc = X86::LD_Fp32m;
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+ : HasAVX ? X86::VMOVSSrm_alt
+ : HasSSE1 ? X86::MOVSSrm_alt
+ : X86::LD_Fp32m;
break;
case MVT::f64:
- if (X86ScalarSSEf64)
- Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
- HasAVX ? X86::VMOVSDrm_alt :
- X86::MOVSDrm_alt;
- else
- Opc = X86::LD_Fp64m;
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+ : HasAVX ? X86::VMOVSDrm_alt
+ : HasSSE2 ? X86::MOVSDrm_alt
+ : X86::LD_Fp64m;
break;
case MVT::f80:
// No f80 support yet.
@@ -521,7 +512,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
break;
case MVT::f32:
- if (X86ScalarSSEf32) {
+ if (HasSSE1) {
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSS;
else
@@ -531,7 +522,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
Opc = X86::ST_Fp32m;
break;
case MVT::f64:
- if (X86ScalarSSEf32) {
+ if (HasSSE2) {
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSD;
else
@@ -1362,8 +1353,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
bool HasAVX512 = Subtarget->hasAVX512();
bool HasAVX = Subtarget->hasAVX();
- bool X86ScalarSSEf32 = Subtarget->hasSSE1();
- bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+ bool HasSSE1 = Subtarget->hasSSE1();
+ bool HasSSE2 = Subtarget->hasSSE2();
switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
@@ -1372,15 +1363,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
case MVT::i32: return X86::CMP32rr;
case MVT::i64: return X86::CMP64rr;
case MVT::f32:
- return X86ScalarSSEf32
- ? (HasAVX512 ? X86::VUCOMISSZrr
- : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
- : 0;
+ return HasAVX512 ? X86::VUCOMISSZrr
+ : HasAVX ? X86::VUCOMISSrr
+ : HasSSE1 ? X86::UCOMISSrr
+ : 0;
case MVT::f64:
- return X86ScalarSSEf64
- ? (HasAVX512 ? X86::VUCOMISDZrr
- : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
- : 0;
+ return HasAVX512 ? X86::VUCOMISDZrr
+ : HasAVX ? X86::VUCOMISDrr
+ : HasSSE2 ? X86::UCOMISDrr
+ : 0;
}
}
@@ -2036,7 +2027,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
/// the select.
bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// Check if the subtarget supports these instructions.
- if (!Subtarget->hasCMov())
+ if (!Subtarget->canUseCMOV())
return false;
// FIXME: Add support for i8.
@@ -2289,12 +2280,13 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
default: return false;
case MVT::i8: Opc = X86::CMOV_GR8; break;
case MVT::i16: Opc = X86::CMOV_GR16; break;
- case MVT::f16: Opc = X86::CMOV_FR16X; break;
case MVT::i32: Opc = X86::CMOV_GR32; break;
- case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
- : X86::CMOV_FR32; break;
- case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
- : X86::CMOV_FR64; break;
+ case MVT::f16:
+ Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;
+ case MVT::f32:
+ Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;
+ case MVT::f64:
+ Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break;
}
const Value *Cond = I->getOperand(0);
@@ -2495,7 +2487,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
}
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
- if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+ if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
I->getOperand(0)->getType()->isFloatTy()) {
bool HasAVX512 = Subtarget->hasAVX512();
// fpext from float to double.
@@ -2509,7 +2501,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
}
bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
- if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+ if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
I->getOperand(0)->getType()->isDoubleTy()) {
bool HasAVX512 = Subtarget->hasAVX512();
// fptrunc from double to float.
@@ -3733,25 +3725,23 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
+ bool HasSSE1 = Subtarget->hasSSE1();
+ bool HasSSE2 = Subtarget->hasSSE2();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX512 = Subtarget->hasAVX512();
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
- if (X86ScalarSSEf32)
- Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
- HasAVX ? X86::VMOVSSrm_alt :
- X86::MOVSSrm_alt;
- else
- Opc = X86::LD_Fp32m;
+ Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+ : HasAVX ? X86::VMOVSSrm_alt
+ : HasSSE1 ? X86::MOVSSrm_alt
+ : X86::LD_Fp32m;
break;
case MVT::f64:
- if (X86ScalarSSEf64)
- Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
- HasAVX ? X86::VMOVSDrm_alt :
- X86::MOVSDrm_alt;
- else
- Opc = X86::LD_Fp64m;
+ Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+ : HasAVX ? X86::VMOVSDrm_alt
+ : HasSSE2 ? X86::MOVSDrm_alt
+ : X86::LD_Fp64m;
break;
case MVT::f80:
// No f80 support yet.
@@ -3852,11 +3842,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
default:
break;
case MVT::f32:
- if (!X86ScalarSSEf32)
+ if (!Subtarget->hasSSE1())
Opc = X86::LD_Fp032;
break;
case MVT::f64:
- if (!X86ScalarSSEf64)
+ if (!Subtarget->hasSSE2())
Opc = X86::LD_Fp064;
break;
case MVT::f80:
@@ -3907,21 +3897,24 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
return 0;
// Get opcode and regclass for the given zero.
+ bool HasSSE1 = Subtarget->hasSSE1();
+ bool HasSSE2 = Subtarget->hasSSE2();
bool HasAVX512 = Subtarget->hasAVX512();
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: return 0;
+ case MVT::f16:
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
+ break;
case MVT::f32:
- if (X86ScalarSSEf32)
- Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
- else
- Opc = X86::LD_Fp032;
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
+ : HasSSE1 ? X86::FsFLD0SS
+ : X86::LD_Fp032;
break;
case MVT::f64:
- if (X86ScalarSSEf64)
- Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
- else
- Opc = X86::LD_Fp064;
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
+ : HasSSE2 ? X86::FsFLD0SD
+ : X86::LD_Fp064;
break;
case MVT::f80:
// No f80 support yet.
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
new file mode 100644
index 000000000000..7e5540022cc8
--- /dev/null
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -0,0 +1,709 @@
+//===-- X86FastPreTileConfig.cpp - Fast Tile Register Configure------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to preconfigure the shape of physical tile registers.
+/// It inserts ldtilecfg ahead of each group of tile registers. The algorithm
+/// walks the instructions of each basic block in reverse order. All the tile
+/// registers that live out of a basic block are spilled and reloaded before
+/// their users. It also checks the dependency of the shapes to ensure each
+/// shape is defined before the ldtilecfg that uses it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "fastpretileconfig"
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads, "Number of loads added");
+
+namespace {
+
+class X86FastPreTileConfig : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ X86MachineFunctionInfo *X86FI = nullptr;
+ MachineFrameInfo *MFI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MachineBasicBlock *MBB = nullptr;
+ int CfgSS = -1;
+ struct PHIInfo {
+ Register Row;
+ Register Col;
+ Register StackAddr;
+ };
+ DenseMap<MachineInstr *, struct PHIInfo> VisitedPHIs;
+
+ /// Maps virtual regs to the frame index where these values are spilled.
+ IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+ /// Has a bit set for each tile virtual register that was determined to be
+ /// live across blocks.
+ BitVector MayLiveAcrossBlocks;
+
+ int getStackSpaceFor(Register VirtReg);
+ void InitializeTileConfigStackSpace();
+ bool mayLiveOut(Register VirtReg, MachineInstr *CfgMI);
+ void spill(MachineBasicBlock::iterator Before, Register VirtReg, bool Kill);
+ void reload(MachineBasicBlock::iterator UseMI, Register VirtReg,
+ MachineOperand *RowMO, MachineOperand *ColMO);
+ void canonicalizePHIs(MachineBasicBlock &MBB);
+ void convertPHI(MachineBasicBlock *MBB, MachineInstr &PHI);
+ void convertPHIs(MachineBasicBlock &MBB);
+ bool configBasicBlock(MachineBasicBlock &MBB);
+
+public:
+ X86FastPreTileConfig() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override {
+ return "Fast Tile Register Preconfigure";
+ }
+
+ /// Perform tile register configure.
+ bool runOnMachineFunction(MachineFunction &MFunc) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86FastPreTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86FastPreTileConfig, DEBUG_TYPE,
+ "Fast Tile Register Preconfigure", false, false)
+INITIALIZE_PASS_END(X86FastPreTileConfig, DEBUG_TYPE,
+ "Fast Tile Register Preconfigure", false, false)
+
+static bool dominates(MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ auto MBBEnd = MBB.end();
+ if (B == MBBEnd)
+ return true;
+
+ MachineBasicBlock::const_iterator I = MBB.begin();
+ for (; &*I != A && &*I != B; ++I)
+ ;
+
+ return &*I == A;
+}
+
+/// This allocates space for the specified virtual register to be held on the
+/// stack.
+int X86FastPreTileConfig::getStackSpaceFor(Register VirtReg) {
+ // Find the location Reg would belong...
+ int SS = StackSlotForVirtReg[VirtReg];
+ // Already has space allocated?
+ if (SS != -1)
+ return SS;
+
+ // Allocate a new stack object for this spill location...
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ unsigned Size = TRI->getSpillSize(RC);
+ Align Alignment = TRI->getSpillAlign(RC);
+ int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment);
+
+ // Assign the slot.
+ StackSlotForVirtReg[VirtReg] = FrameIdx;
+ return FrameIdx;
+}
+
+/// Returns false if \p VirtReg is known to not live out of the current config.
+/// If \p VirtReg lives out of the current MBB, it must live out of the
+/// current config.
+bool X86FastPreTileConfig::mayLiveOut(Register VirtReg, MachineInstr *CfgMI) {
+ if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg)))
+ return true;
+
+ for (const MachineInstr &UseInst : MRI->use_nodbg_instructions(VirtReg)) {
+ if (UseInst.getParent() != MBB) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return true;
+ }
+
+ // The use and def are in the same MBB. If the tile register is
+ // reconfigured, it is clobbered and we need to spill and reload the
+ // tile register.
+ if (CfgMI) {
+ if (dominates(*MBB, *CfgMI, UseInst)) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+void X86FastPreTileConfig::InitializeTileConfigStackSpace() {
+ MachineBasicBlock &MBB = MF->front();
+ MachineInstr *MI = &*MBB.getFirstNonPHI();
+ DebugLoc DL;
+ if (ST->hasAVX512()) {
+ Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+ BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), CfgSS)
+ .addReg(Zmm);
+ } else if (ST->hasAVX2()) {
+ Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
+ BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS)
+ .addReg(Ymm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS,
+ 32)
+ .addReg(Ymm);
+ } else {
+ assert(ST->hasSSE2() && "AMX should assume SSE2 enabled");
+ unsigned StoreOpc = ST->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
+ Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
+ BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS)
+ .addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 16)
+ .addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 32)
+ .addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 48)
+ .addReg(Xmm);
+ }
+ // Fill in the palette first.
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), CfgSS)
+ .addImm(1);
+}
+
+/// Insert a spill instruction for \p VirtReg before \p Before.
+/// TODO: Update DBG_VALUEs with \p VirtReg operands with the stack slot.
+void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before,
+ Register VirtReg, bool Kill) {
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI) << " \n");
+ int FI = getStackSpaceFor(VirtReg);
+ LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
+
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ // Don't need shape information for a tile store, because it is adjacent to
+ // the tile def instruction.
+ TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI);
+ ++NumStores;
+
+ // TODO: update DBG_VALUEs
+}
+
+/// Insert a reload instruction for \p OrigReg before \p UseMI.
+void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
+ Register OrigReg, MachineOperand *RowMO,
+ MachineOperand *ColMO) {
+ int FI = getStackSpaceFor(OrigReg);
+ const TargetRegisterClass &RC = *MRI->getRegClass(OrigReg);
+ Register TileReg;
+ // Fold copy to tileload
+ // BB1:
+ // spill src to s
+ //
+ // BB2:
+ // t = copy src
+ // -->
+ // t = tileload (s)
+ if (UseMI->isCopy())
+ TileReg = UseMI->getOperand(0).getReg();
+ else
+ TileReg = MRI->createVirtualRegister(&RC);
+ // Can't use TII->loadRegFromStackSlot(), because we need the shape
+ // information for reload.
+ // tileloadd (%sp, %idx), %tmm
+ unsigned Opc = X86::PTILELOADDV;
+ Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+ // FIXME: MBB is not the parent of UseMI.
+ MachineInstr *NewMI = BuildMI(*UseMI->getParent(), UseMI, DebugLoc(),
+ TII->get(X86::MOV64ri), StrideReg)
+ .addImm(64);
+ NewMI = addFrameReference(
+ BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), TII->get(Opc), TileReg)
+ .addReg(RowMO->getReg())
+ .addReg(ColMO->getReg()),
+ FI);
+ MachineOperand &MO = NewMI->getOperand(5);
+ MO.setReg(StrideReg);
+ MO.setIsKill(true);
+ RowMO->setIsKill(false);
+ ColMO->setIsKill(false);
+ // Erase copy instruction after it is folded.
+ if (UseMI->isCopy()) {
+ UseMI->eraseFromParent();
+ } else {
+ // Replace the register in the user MI.
+ for (auto &MO : UseMI->operands()) {
+ if (MO.isReg() && MO.getReg() == OrigReg)
+ MO.setReg(TileReg);
+ }
+ }
+
+ ++NumLoads;
+ LLVM_DEBUG(dbgs() << "Reloading " << printReg(OrigReg, TRI) << " into "
+ << printReg(TileReg, TRI) << '\n');
+}
+
+static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+ // The instruction must have 3 operands: tile def, row, col.
+ if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo())
+ return false;
+ MachineOperand &MO = MI.getOperand(0);
+
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ // FIXME it may be used after Greedy RA and the physical
+ // register is not rewritten yet.
+ if (Reg.isVirtual() &&
+ MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
+ return true;
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return true;
+ }
+
+ return false;
+}
+
+static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) {
+ MachineInstr *MI = MRI->getVRegDef(TileReg);
+ if (isTileDef(MRI, *MI)) {
+ MachineOperand *RowMO = &MI->getOperand(1);
+ MachineOperand *ColMO = &MI->getOperand(2);
+ return ShapeT(RowMO, ColMO, MRI);
+ } else if (MI->isCopy()) {
+ TileReg = MI->getOperand(1).getReg();
+ return getShape(MRI, TileReg);
+ }
+
+ // The def should not be a PHI node, because we walk the MBB in reverse post
+ // order.
+ assert(MI->isPHI() && "Unexpected PHI when get shape.");
+ llvm_unreachable("Unexpected MI when get shape.");
+}
+
+// BB0:
+// spill t0 to s0
+// BB1:
+// spill t1 to s1
+//
+// BB2:
+// t = phi [t0, bb0] [t1, bb1]
+// -->
+// row = phi [r0, bb0] [r1, bb1]
+// col = phi [c0, bb0] [c1, bb1]
+// s = phi [s0, bb0] [s1, bb1]
+// t = tileload row, col, s
+// The new instructions are inserted right after the PHI node. The original
+// order of the PHI nodes is not preserved.
+void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB,
+ MachineInstr &PHI) {
+ // 1. Create instruction to get stack slot address of each incoming block.
+ // 2. Create PHI node for the stack address.
+ // 3. Create PHI nodes for the shape. If one of the incoming shapes is an
+ // immediate, use the immediate and delete the PHI node.
+ // 4. Create tileload instruction from the stack address.
+ Register StackAddrReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+ MachineInstrBuilder AddrPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
+ TII->get(X86::PHI), StackAddrReg);
+ Register RowReg = MRI->createVirtualRegister(&X86::GR16RegClass);
+ MachineInstrBuilder RowPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
+ TII->get(X86::PHI), RowReg);
+ Register ColReg = MRI->createVirtualRegister(&X86::GR16RegClass);
+ MachineInstrBuilder ColPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(),
+ TII->get(X86::PHI), ColReg);
+ // Record the mapping of phi node and its row/column information.
+ VisitedPHIs[&PHI] = {RowReg, ColReg, StackAddrReg};
+
+ for (unsigned I = 1, E = PHI.getNumOperands(); I != E; I += 2) {
+ // Get the 2 incoming value of tile register and MBB.
+ Register InTileReg = PHI.getOperand(I).getReg();
+ // Mark it as live-out, so that it will be spilled when we visit the
+ // incoming MBB. Otherwise, since the phi will be deleted, the spill
+ // would be missed when visiting the incoming MBB.
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(InTileReg));
+ MachineBasicBlock *InMBB = PHI.getOperand(I + 1).getMBB();
+
+ MachineInstr *TileDefMI = MRI->getVRegDef(InTileReg);
+ MachineBasicBlock::iterator InsertPos;
+ if (TileDefMI->isPHI()) {
+ InsertPos = TileDefMI->getParent()->getFirstNonPHI();
+ if (VisitedPHIs.count(TileDefMI)) { // circular phi reference
+ // def t1
+ // / \
+ // def t2 t3 = phi(t1, t4) <--
+ // \ / |
+ // t4 = phi(t2, t3)-------------
+ //
+ // For each (row, column and stack address) append phi incoming value.
+ // Create r3 = phi(r1, r4)
+ // Create r4 = phi(r2, r3)
+ Register InRowReg = VisitedPHIs[TileDefMI].Row;
+ Register InColReg = VisitedPHIs[TileDefMI].Col;
+ Register InStackAddrReg = VisitedPHIs[TileDefMI].StackAddr;
+ RowPHI.addReg(InRowReg).addMBB(InMBB);
+ ColPHI.addReg(InColReg).addMBB(InMBB);
+ AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
+ continue;
+ } else {
+ // Recursively convert PHI to tileload
+ convertPHI(TileDefMI->getParent(), *TileDefMI);
+ // The PHI node is converted to a tileload instruction. Get the stack
+ // address from the tileload operands.
+ MachineInstr *TileLoad = MRI->getVRegDef(InTileReg);
+ assert(TileLoad && TileLoad->getOpcode() == X86::PTILELOADDV);
+ Register InRowReg = TileLoad->getOperand(1).getReg();
+ Register InColReg = TileLoad->getOperand(2).getReg();
+ Register InStackAddrReg = TileLoad->getOperand(3).getReg();
+ RowPHI.addReg(InRowReg).addMBB(InMBB);
+ ColPHI.addReg(InColReg).addMBB(InMBB);
+ AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
+ }
+ } else {
+ InsertPos = TileDefMI->getIterator();
+
+ // Fill the incoming operand of row/column phi instruction.
+ ShapeT Shape = getShape(MRI, InTileReg);
+ Shape.getRow()->setIsKill(false);
+ Shape.getCol()->setIsKill(false);
+ RowPHI.addReg(Shape.getRow()->getReg()).addMBB(InMBB);
+ ColPHI.addReg(Shape.getCol()->getReg()).addMBB(InMBB);
+
+ // The incoming tile register lives out of its def BB, so it will be spilled.
+ // Create an MI to get the spill stack slot address for the tile register.
+ int FI = getStackSpaceFor(InTileReg);
+ Register InStackAddrReg =
+ MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+ addOffset(BuildMI(*TileDefMI->getParent(), InsertPos, DebugLoc(),
+ TII->get(X86::LEA64r), InStackAddrReg)
+ .addFrameIndex(FI),
+ 0);
+ AddrPHI.addReg(InStackAddrReg).addMBB(InMBB);
+ }
+ }
+
+ MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI();
+ Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+ BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::MOV64ri), StrideReg)
+ .addImm(64);
+ Register TileReg = PHI.getOperand(0).getReg();
+ MachineInstr *NewMI = addDirectMem(
+ BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::PTILELOADDV), TileReg)
+ .addReg(RowReg)
+ .addReg(ColReg),
+ StackAddrReg);
+ MachineOperand &MO = NewMI->getOperand(5);
+ MO.setReg(StrideReg);
+ MO.setIsKill(true);
+ PHI.eraseFromParent();
+ VisitedPHIs.erase(&PHI);
+}
+
+static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+ MachineOperand &MO = MI.getOperand(0);
+ if (MO.isReg() && MO.getReg().isVirtual() &&
+ MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
+ return true;
+ return false;
+}
+
+void X86FastPreTileConfig::canonicalizePHIs(MachineBasicBlock &MBB) {
+ SmallVector<MachineInstr *, 8> PHIs;
+
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isPHI())
+ break;
+ if (!isTileRegDef(MRI, MI))
+ continue;
+ PHIs.push_back(&MI);
+ }
+ // Canonicalize the phi nodes first. One tile phi may depend on a previous
+ // phi node. For the case below, we need to convert %t4.
+ //
+ // BB0:
+ // %t3 = phi (t1 BB1, t2 BB0)
+ // %t4 = phi (t5 BB1, t3 BB0)
+ // -->
+ // %t3 = phi (t1 BB1, t2 BB0)
+ // %t4 = phi (t5 BB1, t2 BB0)
+ //
+ while (!PHIs.empty()) {
+ MachineInstr *PHI = PHIs.pop_back_val();
+
+ // Find the operand that is incoming from the same MBB and whose def
+ // is also a phi node.
+ MachineOperand *InMO = nullptr;
+ MachineInstr *DefMI = nullptr;
+ for (unsigned I = 1, E = PHI->getNumOperands(); I != E; I += 2) {
+ Register InTileReg = PHI->getOperand(I).getReg();
+ MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB();
+ DefMI = MRI->getVRegDef(InTileReg);
+ if (InMBB != &MBB || !DefMI->isPHI())
+ continue;
+
+ InMO = &PHI->getOperand(I);
+ break;
+ }
+ // If no such operand is found, do nothing.
+ if (!InMO)
+ continue;
+
+ // Current phi node depends on previous phi node. Break the
+ // dependency.
+ Register DefTileReg;
+ for (unsigned I = 1, E = DefMI->getNumOperands(); I != E; I += 2) {
+ MachineBasicBlock *InMBB = PHI->getOperand(I + 1).getMBB();
+ if (InMBB != &MBB)
+ continue;
+ DefTileReg = DefMI->getOperand(I).getReg();
+ InMO->setReg(DefTileReg);
+ break;
+ }
+ }
+}
+
+void X86FastPreTileConfig::convertPHIs(MachineBasicBlock &MBB) {
+ SmallVector<MachineInstr *, 8> PHIs;
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isPHI())
+ break;
+ if (!isTileRegDef(MRI, MI))
+ continue;
+ PHIs.push_back(&MI);
+ }
+ while (!PHIs.empty()) {
+ MachineInstr *MI = PHIs.pop_back_val();
+ VisitedPHIs.clear();
+ convertPHI(&MBB, *MI);
+ }
+}
+
+// PreTileConfig should configure the tile registers on a per-basic-block
+// basis.
+bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
+ this->MBB = &MBB;
+ bool Change = false;
+ MachineInstr *LastShapeMI = nullptr;
+ MachineInstr *LastTileCfg = nullptr;
+ bool HasUnconfigTile = false;
+
+ auto Config = [&](MachineInstr &Before) {
+ if (CfgSS == -1)
+ CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(),
+ ST->getTileConfigAlignment(), false);
+ LastTileCfg = addFrameReference(
+ BuildMI(MBB, Before, DebugLoc(), TII->get(X86::PLDTILECFGV)), CfgSS);
+ LastShapeMI = nullptr;
+ Change = true;
+ };
+ auto HasTileOperand = [](MachineRegisterInfo *MRI, MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual() &&
+ MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
+ return true;
+ }
+ return false;
+ };
+ for (MachineInstr &MI : reverse(MBB)) {
+ // We have transformed the phi nodes before configuring the BB.
+ if (MI.isPHI())
+ break;
+ // Don't collect the shape of a used tile; the tile should be defined
+ // before its use. A spill and reload will happen if there is only a tile
+ // use after ldtilecfg, so the shape can be collected from the reload.
+ // Take the code below for example: %t is reloaded before the tilestore.
+ // call
+ // ....
+ // tilestore %r, %c, %t
+ // -->
+ // call
+ // ldtilecfg
+ // %t = tileload %r, %c
+ // tilestore %r, %c, %t
+ if (HasTileOperand(MRI, MI))
+ HasUnconfigTile = true;
+ // According to the AMX ABI, all the tile registers, including the config
+ // register, are volatile. The caller needs to save/restore the config
+ // register.
+ if (MI.isCall() && HasUnconfigTile) {
+ MachineBasicBlock::iterator I;
+ if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
+ I = ++LastShapeMI->getIterator();
+ else
+ I = ++MI.getIterator();
+ Config(*I);
+ HasUnconfigTile = false;
+ continue;
+ }
+ if (!isTileDef(MRI, MI))
+ continue;
+ //
+ //---------------------------------------------------------------------
+ // Don't handle COPY instructions. If the src and dst of the COPY can be
+ // in the same config, as in the case below, we just check the shape of t0.
+ // def row0
+ // def col0
+ // ldtilecfg
+ // t0 = tilezero(row0, col0)
+ // t1 = copy t0
+ // ...
+ // If the src and dst of the COPY can NOT be in the same config, as in the
+ // case below, a reload is generated before the copy instruction.
+ // def row0
+ // def col0
+ // t0 = tilezero(row0, col0)
+ // spill t0
+ // ...
+ // def row1
+ // def col1
+ // ldtilecfg
+ // t1 = tilezero(row1, col1)
+ // reload t0
+ // t1 = copy t0
+ //---------------------------------------------------------------------
+ //
+ // If MI dominates the last shape def instruction, we need to insert
+ // ldtilecfg after LastShapeMI now. The config doesn't include the
+ // current MI.
+ // def row0
+ // def col0
+ // tilezero(row0, col0) <- MI
+ // def row1
+ // def col1
+ // ldtilecfg <- insert
+ // tilezero(row1, col1)
+ if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
+ Config(*(++LastShapeMI->getIterator()));
+ MachineOperand *RowMO = &MI.getOperand(1);
+ MachineOperand *ColMO = &MI.getOperand(2);
+ MachineInstr *RowMI = MRI->getVRegDef(RowMO->getReg());
+ MachineInstr *ColMI = MRI->getVRegDef(ColMO->getReg());
+ // If the shape is defined in the current MBB, check the domination.
+ // FIXME: how about loops?
+ if (RowMI->getParent() == &MBB) {
+ if (!LastShapeMI)
+ LastShapeMI = RowMI;
+ else if (dominates(MBB, LastShapeMI, RowMI))
+ LastShapeMI = RowMI;
+ }
+ if (ColMI->getParent() == &MBB) {
+ if (!LastShapeMI)
+ LastShapeMI = ColMI;
+ else if (dominates(MBB, LastShapeMI, ColMI))
+ LastShapeMI = ColMI;
+ }
+ // If there is a user that lives out of the tilecfg, spill the tile and
+ // reload it before the user.
+ Register TileReg = MI.getOperand(0).getReg();
+ if (mayLiveOut(TileReg, LastTileCfg))
+ spill(++MI.getIterator(), TileReg, false);
+ for (MachineInstr &UseMI : MRI->use_instructions(TileReg)) {
+ if (UseMI.getParent() == &MBB) {
+ // Only reload if the use is after the ldtilecfg, i.e. the tile value
+ // crosses a reconfiguration.
+ if (!LastTileCfg || !dominates(MBB, LastTileCfg, UseMI))
+ continue;
+ // Reload before UseMI.
+ reload(UseMI.getIterator(), TileReg, RowMO, ColMO);
+ } else {
+ // Don't reload for a phi instruction; phi reloads are handled separately.
+ // TODO: merge the reload for the same user MBB.
+ if (!UseMI.isPHI())
+ reload(UseMI.getIterator(), TileReg, RowMO, ColMO);
+ }
+ }
+ }
+
+ // Configure tile registers at the head of the MBB
+ if (HasUnconfigTile) {
+ MachineInstr *Before;
+ if (LastShapeMI == nullptr || LastShapeMI->isPHI())
+ Before = &*MBB.getFirstNonPHI();
+ else
+ Before = &*(++LastShapeMI->getIterator());
+
+ Config(*Before);
+ }
+
+ return Change;
+}
+
+bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
+ MF = &MFunc;
+ MRI = &MFunc.getRegInfo();
+ ST = &MFunc.getSubtarget<X86Subtarget>();
+ TII = ST->getInstrInfo();
+ X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
+ MFI = &MFunc.getFrameInfo();
+ TRI = ST->getRegisterInfo();
+ CfgSS = -1;
+
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ // Abandon early if there is no tile register to config.
+ bool HasVirtTileReg = false;
+ for (unsigned I = 0, E = NumVirtRegs; I != E; ++I) {
+ Register VirtReg = Register::index2VirtReg(I);
+ if (MRI->getRegClass(VirtReg)->getID() == X86::TILERegClassID) {
+ HasVirtTileReg = true;
+ break;
+ }
+ }
+ if (!HasVirtTileReg)
+ return false;
+
+ StackSlotForVirtReg.resize(NumVirtRegs);
+ MayLiveAcrossBlocks.clear();
+ // We will create registers during config. The "* 3" is to make sure the
+ // virtual register number doesn't exceed the size of the bit vector.
+ MayLiveAcrossBlocks.resize(NumVirtRegs * 3);
+ bool Change = false;
+ assert(MRI->isSSA());
+
+ // Canonicalize the phi node first.
+ for (MachineBasicBlock &MBB : MFunc)
+ canonicalizePHIs(MBB);
+
+ // Loop over all of the basic blocks in reverse post order and insert
+ // ldtilecfg for tile registers. The reverse post order is to facilitate
+ // PHI node conversion.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(MF);
+ for (MachineBasicBlock *MBB : RPOT) {
+ convertPHIs(*MBB);
+ Change |= configBasicBlock(*MBB);
+ }
+
+ if (Change)
+ InitializeTileConfigStackSpace();
+
+ StackSlotForVirtReg.clear();
+ return Change;
+}
+
+FunctionPass *llvm::createX86FastPreTileConfigPass() {
+ return new X86FastPreTileConfig();
+}
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 061fff50bcea..2a20cd13791d 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -40,40 +40,25 @@ namespace {
class X86FastTileConfig : public MachineFunctionPass {
// context
MachineFunction *MF = nullptr;
- const X86Subtarget *ST = nullptr;
- const TargetRegisterInfo *TRI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
X86MachineFunctionInfo *X86FI = nullptr;
- MachineInstr *getTileConfigPoint();
- void tileConfig();
+ bool configBasicBlock(MachineBasicBlock &MBB);
public:
X86FastTileConfig() : MachineFunctionPass(ID) {}
- bool fastTileConfig();
- bool isTileLoad(MachineInstr &MI);
- bool isTileStore(MachineInstr &MI);
- bool isAMXInstr(MachineInstr &MI);
-
- MachineInstr *getKeyAMXInstr(MachineInstr *MI);
- void getTileShapesCfg(MachineInstr *MI,
- SmallVector<MachineOperand *> &ShapedTiles);
- void getShapeCfgInstrs(MachineInstr *MI,
- std::map<unsigned, MachineInstr *> &RowCfgs,
- std::map<unsigned, MachineInstr *> &ColCfgs);
-
/// Return the pass name.
StringRef getPassName() const override {
return "Fast Tile Register Configure";
}
- void materializeTileCfg(MachineInstr *MI);
-
- void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
- std::map<unsigned, MachineInstr *> &RowCfgs,
- std::map<unsigned, MachineInstr *> &ColCfgs);
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
/// Perform register allocation.
bool runOnMachineFunction(MachineFunction &MFunc) override;
@@ -95,209 +80,107 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)
-static bool isTilePhysReg(MachineOperand &Op) {
- if (!Op.isReg())
+static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+ // There is no phi instruction after register allocation.
+ assert(MI.isPHI() == false);
+ // The instruction must have 3 operands: tile def, row, col.
+ // It should be an AMX pseudo instruction that has shape operands.
+ if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
+ !MI.isPseudo())
return false;
+ MachineOperand &MO = MI.getOperand(0);
+
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ // FIXME it may be used after Greedy RA and the physical
+ // register is not rewritten yet.
+ if (Reg.isVirtual() &&
+ MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
+ return true;
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return true;
+ }
- Register Reg = Op.getReg();
- if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
- return true;
return false;
}
-static unsigned getTilePhysRegIdx(MachineOperand *Op) {
- assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
- return Op->getReg() - X86::TMM0;
-}
-
-static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
- unsigned Offset = 48 + TIdx;
- MI->getOperand(3).ChangeToImmediate(Offset);
-}
-
-static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
- unsigned Offset = 16 + TIdx * 2;
- MI->getOperand(3).ChangeToImmediate(Offset);
-}
-
-bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
- return MI.getOpcode() == X86::PTILELOADDV ||
- MI.getOpcode() == X86::PTILELOADDT1V;
-}
-bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
- return MI.getOpcode() == X86::PTILESTOREDV;
-}
-bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
- // TODO: May need to handle some special nontile amx instrucion.
- if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr())
- return false;
-
- return llvm::any_of(MI.operands(), isTilePhysReg);
-}
-
-MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
- auto Cfg = MachineBasicBlock::iterator(MI);
- MachineBasicBlock *MBB = MI->getParent();
- MachineInstr *KeyMI = nullptr;
- int KeyAMXNum = 0;
-
- for (auto II = Cfg; II != MBB->end(); II++) {
- if (isTileLoad(*II)) {
- KeyMI = &*II;
+// Configure the tile registers on a per-basic-block basis by writing the
+// shapes of the tile definitions into the ldtilecfg stack slot.
+bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
+ bool Change = false;
+ SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos;
+ for (MachineInstr &MI : reverse(MBB)) {
+ if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV)
continue;
+ // AMX instructions that define a tile register.
+ if (MI.getOpcode() != X86::PLDTILECFGV) {
+ MachineOperand &Row = MI.getOperand(1);
+ MachineOperand &Col = MI.getOperand(2);
+ unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0;
+ ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)});
+ } else { // PLDTILECFGV
+ // Rewrite the shape information to memory. The stack slot should have
+ // been initialized to zero by the pre-config pass.
+ int SS = MI.getOperand(0).getIndex(); // tile config stack slot.
+ for (auto &ShapeInfo : ShapeInfos) {
+ DebugLoc DL;
+ unsigned TMMIdx = ShapeInfo.first;
+ Register RowReg = ShapeInfo.second.getRow()->getReg();
+ Register ColReg = ShapeInfo.second.getCol()->getReg();
+ // Here is the data format for the tile config.
+ // 0 palette
+ // 1 start_row
+ // 2-15 reserved, must be zero
+ // 16-17 tile0.colsb Tile 0 bytes per row.
+ // 18-19 tile1.colsb Tile 1 bytes per row.
+ // 20-21 tile2.colsb Tile 2 bytes per row.
+ // ... (sequence continues)
+ // 30-31 tile7.colsb Tile 7 bytes per row.
+ // 32-47 reserved, must be zero
+ // 48 tile0.rows Tile 0 rows.
+ // 49 tile1.rows Tile 1 rows.
+ // 50 tile2.rows Tile 2 rows.
+ // ... (sequence continues)
+ // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero
+ int RowOffset = 48 + TMMIdx;
+ int ColOffset = 16 + TMMIdx * 2;
+
+ Register SubRowReg = TRI->getSubReg(RowReg, X86::sub_8bit);
+ BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), SubRowReg);
+ MachineInstrBuilder StoreRow =
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV8mr));
+ addFrameReference(StoreRow, SS, RowOffset).addReg(SubRowReg);
+
+ MachineInstrBuilder StoreCol =
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV16mr));
+ addFrameReference(StoreCol, SS, ColOffset).addReg(ColReg);
+ }
+ ShapeInfos.clear();
+ Change = true;
}
-
- if (isTileStore(*II)) {
- assert(KeyMI && "Key AMX Should be found before!");
- break;
- }
-
- if (isAMXInstr(*II)) {
- assert((KeyAMXNum == 0) && "Too many Key AMX instruction!");
- KeyAMXNum++;
- KeyMI = &*II;
- }
- }
- assert(KeyMI && "There must be an AMX instruction.");
- return KeyMI;
-}
-
-// Orderly get the tiles in key amx instruction, uses before defs.
-void X86FastTileConfig::getTileShapesCfg(
- MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
- MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);
-
- SmallVector<MachineOperand *> DefTiles;
- for (MachineOperand &MO : KeyMI->operands()) {
- if (!isTilePhysReg(MO))
- continue;
- if (MO.isDef())
- DefTiles.push_back(&MO);
- else
- ShapedTiles.push_back(&MO);
- }
- ShapedTiles.append(DefTiles);
-}
-
-// We pre-config the shapes at position named with "amx.tmm.N.shape.row* and
-// amx.shape.N.col*" at pass "Pre AMX Tile Config".
-// The 'N' implies the order of tiles in key amx intrinsic.
-void X86FastTileConfig::getShapeCfgInstrs(
- MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
- std::map<unsigned, MachineInstr *> &ColCfgs) {
- auto Cfg = MachineBasicBlock::iterator(MI);
- MachineBasicBlock *MBB = MI->getParent();
-
- for (auto II = Cfg; II != MBB->begin(); II--) {
- if (isAMXInstr(*II) || II->isTerminator() || II->isCall())
- break;
- if (!II->mayStore() || !II->hasOneMemOperand())
- continue;
- const Value *MemPtr = II->memoperands()[0]->getValue();
- if (!MemPtr)
- continue;
-
- StringRef Name = MemPtr->getName();
- if (!Name.startswith("amx.tmm."))
- continue;
-
- // Get the 'N'th tile shape config in key amx instruction.
- auto N = Name.find(".shape");
- StringRef STileIdx = Name.slice(8, N);
- unsigned Idx;
- STileIdx.getAsInteger(10, Idx);
-
- // And related them with their store instructions.
- if (Name.contains("row"))
- RowCfgs[Idx] = &*II;
- else if (Name.contains("col"))
- ColCfgs[Idx] = &*II;
- else
- llvm_unreachable("Invalid tile shape info!");
}
- assert((RowCfgs.size() == ColCfgs.size()) &&
- "The number of tile row and col must be equal!");
-}
-
-// Here is the data format for the tile config.
-// 0 palette = 1 now.
-// 1 start_row = 0 now.
-// 2-15 reserved, must be zero
-// 16-17 tile0.colsb Tile 0 bytes per row.
-// 18-19 tile1.colsb Tile 1 bytes per row.
-// 20-21 tile2.colsb Tile 2 bytes per row.
-// ... (sequence continues)
-// 30-31 tile7.colsb Tile 7 bytes per row.
-// 32-47 reserved, must be zero
-// 48 tile0.rows Tile 0 rows.
-// 49 tile1.rows Tile 1 rows.
-// 50 tile2.rows Tile 2 rows.
-// ... (sequence continues)
-// 55 tile7.rows Tile 7 rows.
-// 56-63 reserved, must be zero
-void X86FastTileConfig::rewriteTileCfg(
- SmallVector<MachineOperand *> &ShapedTiles,
- std::map<unsigned, MachineInstr *> &RowCfgs,
- std::map<unsigned, MachineInstr *> &ColCfgs) {
- assert((RowCfgs.size() == ShapedTiles.size()) &&
- "The number of tile shapes not equal with the number of tiles!");
- // Orderly get the tiles and adjust the shape config.
- for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) {
- MachineOperand *MO = ShapedTiles[I];
- unsigned TmmIdx = getTilePhysRegIdx(MO);
- if (I == TmmIdx)
- continue;
- adjustRowCfg(TmmIdx, RowCfgs[I]);
- adjustColCfg(TmmIdx, ColCfgs[I]);
- }
-}
-
-// We have already preconfig the shapes before fast register allocation at
-// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register
-// allocation, the shapes pre-written before may not rightly corresponding
-// to the correct tmm registers, so we need adjust them.
-void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
- SmallVector<MachineOperand *> ShapedTiles;
- std::map<unsigned, MachineInstr *> RowCfgs;
- std::map<unsigned, MachineInstr *> ColCfgs;
-
- // Orderly keep the tile uses and def in ShapedTiles;
- getTileShapesCfg(CfgMI, ShapedTiles);
- assert(ShapedTiles.size() && "Not find shapes config!");
-
- getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs);
-
- rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs);
-}
-
-bool X86FastTileConfig::fastTileConfig() {
- bool Changed = false;
-
- for (MachineBasicBlock &MBB : *MF) {
- SmallVector<MachineInstr *, 2> CFGs;
- for (MachineInstr &MI : MBB)
- if (MI.getOpcode() == X86::PLDTILECFGV)
- CFGs.push_back(&MI);
- for (auto *MI : CFGs)
- materializeTileCfg(MI);
- if (!CFGs.empty())
- Changed = true;
- }
- if (Changed)
+ if (Change)
X86FI->setHasVirtualTileReg(true);
- return Changed;
+
+ return Change;
}
bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
MF = &MFunc;
MRI = &MFunc.getRegInfo();
- ST = &MFunc.getSubtarget<X86Subtarget>();
+ const TargetSubtargetInfo *ST = &MFunc.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = MFunc.getSubtarget().getInstrInfo();
X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
+ bool Change = false;
+
+ // Loop over all of the basic blocks, configuring the tile registers in each.
+ for (MachineBasicBlock &MBB : MFunc)
+ Change |= configBasicBlock(MBB);
- return fastTileConfig();
+ return Change;
}
FunctionPass *llvm::createX86FastTileConfigPass() {
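For reference, the stores emitted in configBasicBlock() above follow the LDTILECFG memory layout documented in the comment block: one byte of rows per tile starting at byte 48, and a two-byte colsb entry per tile starting at byte 16. A minimal standalone sketch of that offset arithmetic, with an illustrative helper name that is not part of the patch:

#include <cstdint>
#include <cstring>

// Mirrors the MOV8mr/MOV16mr stores above: byte 48+N holds tileN.rows and
// bytes 16+2N..17+2N hold tileN.colsb within the 64-byte tile-config blob.
// Assumes TMMIdx < 8 and Cfg points at a zero-initialized 64-byte buffer.
inline void writeTileShape(uint8_t *Cfg, unsigned TMMIdx, uint8_t Rows,
                           uint16_t ColsB) {
  Cfg[48 + TMMIdx] = Rows;
  std::memcpy(Cfg + 16 + TMMIdx * 2, &ColsB, sizeof(ColsB));
}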
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 4730b936ec1f..b01145809ac6 100644
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -229,7 +229,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
bool IsSlowLEA = ST.slowLEA();
bool IsSlow3OpsLEA = ST.slow3OpsLEA();
- bool LEAUsesAG = ST.LEAusesAG();
+ bool LEAUsesAG = ST.leaUsesAG();
bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize();
bool UseLEAForSP = ST.useLeaForSP();
@@ -546,7 +546,6 @@ bool FixupLEAPass::optLEAALU(MachineBasicBlock::iterator &I,
if (KilledIndex)
KilledIndex->setIsKill(false);
- MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI1, 1);
MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI2, 1);
MBB.erase(I);
MBB.erase(AluI);
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 2f0ab4ca9de4..33f5bb365da8 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -99,17 +99,17 @@ namespace {
// but the exact mapping of FP registers to stack slots is fixed later.
struct LiveBundle {
// Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
- unsigned Mask;
+ unsigned Mask = 0;
// Number of pre-assigned live registers in FixStack. This is 0 when the
// stack order has not yet been fixed.
- unsigned FixCount;
+ unsigned FixCount = 0;
// Assigned stack order for live-in registers.
// FixStack[i] == getStackEntry(i) for all i < FixCount.
unsigned char FixStack[8];
- LiveBundle() : Mask(0), FixCount(0) {}
+ LiveBundle() = default;
// Have the live registers been assigned a stack order yet?
bool isFixed() const { return !Mask || FixCount; }
@@ -866,7 +866,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
if (Opcode != -1) {
I->setDesc(TII->get(Opcode));
if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr)
- I->RemoveOperand(0);
+ I->removeOperand(0);
MI.dropDebugNumber();
} else { // Insert an explicit pop
// If this instruction sets FPSW, which is read in following instruction,
@@ -1034,7 +1034,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
STReturns |= 1 << getFPReg(Op);
// Remove the operand so that later passes don't see it.
- MI.RemoveOperand(i);
+ MI.removeOperand(i);
--i;
--e;
}
@@ -1098,7 +1098,7 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) {
LiveMask |= (1 << getFPReg(Op));
// Remove the operand so that later passes don't see it.
- MI.RemoveOperand(i);
+ MI.removeOperand(i);
--i;
--e;
}
@@ -1162,7 +1162,7 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
unsigned DestReg = getFPReg(MI.getOperand(0));
// Change from the pseudo instruction to the concrete instruction.
- MI.RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI.removeOperand(0); // Remove the explicit ST(0) operand
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
MI.addOperand(
MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true));
@@ -1210,7 +1210,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
}
// Convert from the pseudo instruction to the concrete instruction.
- MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand
+ MI.removeOperand(NumOps - 1); // Remove explicit ST(0) operand
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
MI.addOperand(
MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true));
@@ -1263,8 +1263,8 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
}
// Change from the pseudo instruction to the concrete instruction.
- MI.RemoveOperand(1); // Drop the source operand.
- MI.RemoveOperand(0); // Drop the destination operand.
+ MI.removeOperand(1); // Drop the source operand.
+ MI.removeOperand(0); // Drop the destination operand.
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
MI.dropDebugNumber();
}
@@ -1464,7 +1464,7 @@ void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
// Change from the pseudo instruction to the concrete instruction.
MI.getOperand(0).setReg(getSTReg(Op1));
- MI.RemoveOperand(1);
+ MI.removeOperand(1);
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
MI.dropDebugNumber();
@@ -1489,8 +1489,8 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
// Change the second operand to the stack register that the operand is in.
// Change from the pseudo instruction to the concrete instruction.
- MI.RemoveOperand(0);
- MI.RemoveOperand(1);
+ MI.removeOperand(0);
+ MI.removeOperand(1);
MI.getOperand(0).setReg(getSTReg(Op1));
MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
MI.dropDebugNumber();
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 51f2ced321bb..d524090f902e 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "X86FrameLowering.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
@@ -19,6 +20,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -99,7 +101,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
- MFI.hasCopyImplyingStackAdjustment());
+ (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()));
}
static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
@@ -435,11 +437,13 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
- const MCCFIInstruction &CFIInst) const {
+ const MCCFIInstruction &CFIInst,
+ MachineInstr::MIFlag Flag) const {
MachineFunction &MF = *MBB.getParent();
unsigned CFIIndex = MF.addFrameInst(CFIInst);
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(Flag);
}
/// Emits Dwarf Info specifying offsets of callee saved registers and
@@ -492,6 +496,87 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
}
}
+void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
+ MachineBasicBlock &MBB) const {
+ const MachineFunction &MF = *MBB.getParent();
+
+ // Insertion point.
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+
+ // Fake a debug loc.
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ // Zero out FP stack if referenced. Do this outside of the loop below so that
+ // it's done only once.
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ for (MCRegister Reg : RegsToZero.set_bits()) {
+ if (!X86::RFP80RegClass.contains(Reg))
+ continue;
+
+ unsigned NumFPRegs = ST.is64Bit() ? 8 : 7;
+ for (unsigned i = 0; i != NumFPRegs; ++i)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0));
+
+ for (unsigned i = 0; i != NumFPRegs; ++i)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0);
+ break;
+ }
+
+ // For GPRs, we only care to clear out the 32-bit register.
+ BitVector GPRsToZero(TRI->getNumRegs());
+ for (MCRegister Reg : RegsToZero.set_bits())
+ if (TRI->isGeneralPurposeRegister(MF, Reg)) {
+ GPRsToZero.set(getX86SubSuperRegisterOrZero(Reg, 32));
+ RegsToZero.reset(Reg);
+ }
+
+ for (MCRegister Reg : GPRsToZero.set_bits())
+ BuildMI(MBB, MBBI, DL, TII.get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+
+ // Zero out registers.
+ for (MCRegister Reg : RegsToZero.set_bits()) {
+ if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
+ // FIXME: Ignore MMX registers?
+ continue;
+
+ unsigned XorOp;
+ if (X86::VR128RegClass.contains(Reg)) {
+ // XMM#
+ if (!ST.hasSSE1())
+ continue;
+ XorOp = X86::PXORrr;
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // YMM#
+ if (!ST.hasAVX())
+ continue;
+ XorOp = X86::VPXORrr;
+ } else if (X86::VR512RegClass.contains(Reg)) {
+ // ZMM#
+ if (!ST.hasAVX512())
+ continue;
+ XorOp = X86::VPXORYrr;
+ } else if (X86::VK1RegClass.contains(Reg) ||
+ X86::VK2RegClass.contains(Reg) ||
+ X86::VK4RegClass.contains(Reg) ||
+ X86::VK8RegClass.contains(Reg) ||
+ X86::VK16RegClass.contains(Reg)) {
+ if (!ST.hasVLX())
+ continue;
+ XorOp = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr;
+ } else {
+ continue;
+ }
+
+ BuildMI(MBB, MBBI, DL, TII.get(XorOp), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ }
+}
+
void X86FrameLowering::emitStackProbe(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,
@@ -1289,6 +1374,9 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
}
+/// Return true if we need to use the restricted Windows x64 prologue and
+/// epilogue code patterns that can be described with WinCFI (.seh_*
+/// directives).
bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const {
return MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
}
@@ -1558,12 +1646,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth),
+ MachineInstr::FrameSetup);
// Change the rule for the FramePtr to be an "offset" rule.
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
- nullptr, DwarfFramePtr, 2 * stackGrowth));
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfFramePtr,
+ 2 * stackGrowth),
+ MachineInstr::FrameSetup);
}
if (NeedsWinCFI) {
@@ -1630,7 +1721,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
BuildCFI(
MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+ MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr),
+ MachineInstr::FrameSetup);
}
if (NeedsWinFPO) {
@@ -1681,7 +1773,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
+ MachineInstr::FrameSetup);
StackOffset += stackGrowth;
}
@@ -1962,7 +2055,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
assert(StackSize);
BuildCFI(
MBB, MBBI, DL,
- MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth),
+ MachineInstr::FrameSetup);
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
@@ -2145,11 +2239,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned DwarfStackPtr =
TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize));
+ MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),
+ MachineInstr::FrameDestroy);
if (!MBB.succ_empty() && !MBB.isReturnBlock()) {
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
BuildCFI(MBB, AfterPop, DL,
- MCCFIInstruction::createRestore(nullptr, DwarfFramePtr));
+ MCCFIInstruction::createRestore(nullptr, DwarfFramePtr),
+ MachineInstr::FrameDestroy);
--MBBI;
--AfterPop;
}
@@ -2226,7 +2322,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::cfiDefCfaOffset(
- nullptr, CSSize + TailCallArgReserveSize + SlotSize));
+ nullptr, CSSize + TailCallArgReserveSize + SlotSize),
+ MachineInstr::FrameDestroy);
}
--MBBI;
}
@@ -2252,7 +2349,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Opc == X86::POP32r || Opc == X86::POP64r) {
Offset += SlotSize;
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
+ MachineInstr::FrameDestroy);
}
}
}
@@ -2830,17 +2928,8 @@ void X86FrameLowering::adjustForSegmentedStacks(
// prologue.
StackSize = MFI.getStackSize();
- // Do not generate a prologue for leaf functions with a stack of size zero.
- // For non-leaf functions we have to allow for the possibility that the
- // callis to a non-split function, as in PR37807. This function could also
- // take the address of a non-split function. When the linker tries to adjust
- // its non-existent prologue, it would fail with an error. Mark the object
- // file so that such failures are not errors. See this Go language bug-report
- // https://go-review.googlesource.com/c/go/+/148819/
- if (StackSize == 0 && !MFI.hasTailCall()) {
- MF.getMMI().setHasNosplitStack(true);
+ if (!MFI.needsSplitStackProlog())
return;
- }
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
@@ -3023,7 +3112,6 @@ void X86FrameLowering::adjustForSegmentedStacks(
.addReg(0)
.addExternalSymbol("__morestack_addr")
.addReg(0);
- MF.getMMI().setUsesMorestackAddr(true);
} else {
if (Is64Bit)
BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
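The new emitZeroCallUsedRegs() hook above picks a per-register-class zeroing idiom (XOR32rr for GPRs, PXOR/VPXOR/VPXORY for XMM/YMM/ZMM, KXOR for mask registers) and skips classes the subtarget cannot encode. A hedged usage sketch follows; the attribute spelling is an assumption about the GCC/Clang front end and is not part of this backend patch:

// Request zeroing of call-used (caller-saved) GPRs on return; the hook above
// is what would materialize the xor sequence in the epilogue (assumed
// front-end attribute spelling, shown for illustration only).
__attribute__((zero_call_used_regs("used-gpr")))
int scrub_on_return(int x) {
  return x * 3 + 1;
}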
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 987facbfeae4..9b83fe77d505 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -176,7 +176,8 @@ public:
/// Wraps up getting a CFI index and building a MachineInstr for it.
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
+ const DebugLoc &DL, const MCCFIInstruction &CFIInst,
+ MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
/// Sets up EBP and optionally ESI based on the incoming EBP value. Only
/// needed for 32-bit. Used in funclet prologues and at catchret destinations.
@@ -233,6 +234,10 @@ private:
const DebugLoc &DL, uint64_t Offset,
uint64_t Align) const;
+ /// Emit target zero call-used regs.
+ void emitZeroCallUsedRegs(BitVector RegsToZero,
+ MachineBasicBlock &MBB) const override;
+
void adjustFrameForMsvcCxxEh(MachineFunction &MF) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5b90c67deae6..f88037e95d33 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -59,30 +59,27 @@ namespace {
enum {
RegBase,
FrameIndexBase
- } BaseType;
+ } BaseType = RegBase;
// This is really a union, discriminated by BaseType!
SDValue Base_Reg;
- int Base_FrameIndex;
+ int Base_FrameIndex = 0;
- unsigned Scale;
+ unsigned Scale = 1;
SDValue IndexReg;
- int32_t Disp;
+ int32_t Disp = 0;
SDValue Segment;
- const GlobalValue *GV;
- const Constant *CP;
- const BlockAddress *BlockAddr;
- const char *ES;
- MCSymbol *MCSym;
- int JT;
+ const GlobalValue *GV = nullptr;
+ const Constant *CP = nullptr;
+ const BlockAddress *BlockAddr = nullptr;
+ const char *ES = nullptr;
+ MCSymbol *MCSym = nullptr;
+ int JT = -1;
Align Alignment; // CP alignment.
- unsigned char SymbolFlags; // X86II::MO_*
+ unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
bool NegateIndex = false;
- X86ISelAddressMode()
- : BaseType(RegBase), Base_FrameIndex(0), Scale(1), Disp(0), GV(nullptr),
- CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1),
- SymbolFlags(X86II::MO_NO_FLAG) {}
+ X86ISelAddressMode() = default;
bool hasSymbolicDisplacement() const {
return GV != nullptr || CP != nullptr || ES != nullptr ||
@@ -446,6 +443,43 @@ namespace {
return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
}
+ SDValue getSBBZero(SDNode *N) {
+ SDLoc dl(N);
+ MVT VT = N->getSimpleValueType(0);
+
+ // Create zero.
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue Zero =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
+ if (VT == MVT::i64) {
+ Zero = SDValue(
+ CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
+ 0);
+ }
+
+ // Copy flags to the EFLAGS register and glue it to next node.
+ unsigned Opcode = N->getOpcode();
+ assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
+ "Unexpected opcode for SBB materialization");
+ unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ N->getOperand(FlagOpIndex), SDValue());
+
+ // Create a 64-bit instruction if the result is 64-bits otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
+ MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ VTs = CurDAG->getVTList(SBBVT, MVT::i32);
+ return SDValue(
+ CurDAG->getMachineNode(Opc, dl, VTs,
+ {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
+ 0);
+ }
+
// Helper to detect unneeded and instructions on shift amounts. Called
// from PatFrags in tablegen.
bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
@@ -476,6 +510,9 @@ namespace {
return Subtarget->getInstrInfo();
}
+ /// Return the condition code of the given SDNode, or COND_INVALID if it has none.
+ X86::CondCode getCondFromNode(SDNode *N) const;
+
/// Address-mode matching performs shift-of-and to and-of-shift
/// reassociation in order to expose more scaled addressing
/// opportunities.
@@ -492,7 +529,7 @@ namespace {
unsigned StoreSize = N->getMemoryVT().getStoreSize();
- if (N->getAlignment() < StoreSize)
+ if (N->getAlign().value() < StoreSize)
return false;
switch (StoreSize) {
@@ -2391,6 +2428,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
return false;
break;
+ case ISD::XOR:
+ // We want to look through a transform in InstCombine that
+ // turns 'add' with min_signed_val into 'xor', so we can treat this 'xor'
+ // exactly like an 'add'.
+ if (isMinSignedConstant(N.getOperand(1)) && !matchAdd(N, AM, Depth))
+ return false;
+ break;
+
case ISD::AND: {
// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.
@@ -2745,10 +2790,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
case X86ISD::SUB:
case X86ISD::ADC:
case X86ISD::SBB:
- /* TODO: These opcodes can be added safely, but we may want to justify
- their inclusion for different reasons (better for reg-alloc).
case X86ISD::SMUL:
case X86ISD::UMUL:
+ /* TODO: These opcodes can be added safely, but we may want to justify
+ their inclusion for different reasons (better for reg-alloc).
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
@@ -2759,10 +2804,9 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
return false;
}
};
- // TODO: This could be an 'or' rather than 'and' to make the transform more
- // likely to happen. We might want to factor in whether there's a
- // load folding opportunity for the math op that disappears with LEA.
- if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+ // TODO: We might want to factor in whether there's a load folding
+ // opportunity for the math op that disappears with LEA.
+ if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
Complexity++;
}
@@ -2891,24 +2935,15 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
CR->getSignedMax().slt(1ull << Width);
}
-static X86::CondCode getCondFromNode(SDNode *N) {
+X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
assert(N->isMachineOpcode() && "Unexpected node");
- X86::CondCode CC = X86::COND_INVALID;
unsigned Opc = N->getMachineOpcode();
- if (Opc == X86::JCC_1)
- CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
- else if (Opc == X86::SETCCr)
- CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
- else if (Opc == X86::SETCCm)
- CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
- else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
- Opc == X86::CMOV64rr)
- CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
- else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
- Opc == X86::CMOV64rm)
- CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
-
- return CC;
+ const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
+ int CondNo = X86::getCondSrcNoFromDesc(MCID);
+ if (CondNo < 0)
+ return X86::COND_INVALID;
+
+ return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
}
/// Test whether the given X86ISD::CMP node has any users that use a flag
@@ -3464,7 +3499,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
auto checkUses = [AllowExtraUsesByDefault](SDValue Op, unsigned NUses,
Optional<bool> AllowExtraUses) {
- return AllowExtraUses.getValueOr(AllowExtraUsesByDefault) ||
+ return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
};
auto checkOneUse = [checkUses](SDValue Op,
@@ -5478,7 +5513,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
MVT CmpVT = N0.getSimpleValueType();
// Floating point needs special handling if we don't have FCOMI.
- if (Subtarget->hasCMov())
+ if (Subtarget->canUseCMOV())
break;
bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
@@ -5518,7 +5553,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Move AH into flags.
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
- assert(Subtarget->hasLAHFSAHF() &&
+ assert(Subtarget->canUseLAHFSAHF() &&
"Target doesn't support SAHF or FCOMI?");
SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
Chain = AH;
@@ -5567,40 +5602,86 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
- if (N0.getOpcode() == ISD::AND &&
- N0.getNode()->hasOneUse() &&
+ if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!C) break;
- uint64_t Mask = C->getZExtValue();
+ auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!MaskC)
+ break;
+
// We may have looked through a truncate so mask off any bits that
// shouldn't be part of the compare.
+ uint64_t Mask = MaskC->getZExtValue();
Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
- // Check if we can replace AND+IMM64 with a shift. This is possible for
- // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
- // flag.
- if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
+ // Check if we can replace AND+IMM{32,64} with a shift. This is possible
+ // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
+ // zero flag.
+ if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
onlyUsesZeroFlag(SDValue(Node, 0))) {
- if (isMask_64(~Mask)) {
- unsigned TrailingZeros = countTrailingZeros(Mask);
- SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
- SDValue Shift =
- SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
- N0.getOperand(0), Imm), 0);
- MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
- MVT::i32, Shift, Shift);
- ReplaceNode(Node, Test);
- return;
+ unsigned ShiftOpcode = ISD::DELETED_NODE;
+ unsigned ShiftAmt;
+ unsigned SubRegIdx;
+ MVT SubRegVT;
+ unsigned TestOpcode;
+ unsigned LeadingZeros = countLeadingZeros(Mask);
+ unsigned TrailingZeros = countTrailingZeros(Mask);
+
+ // With leading/trailing zeros, the transform is profitable if we can
+ // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
+ // incurring any extra register moves.
+ bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
+ if (LeadingZeros == 0 && SavesBytes) {
+ // If the mask covers the most significant bit, then we can replace
+ // TEST+AND with a SHR and check eflags.
+ // This emits a redundant TEST which is subsequently eliminated.
+ ShiftOpcode = X86::SHR64ri;
+ ShiftAmt = TrailingZeros;
+ SubRegIdx = 0;
+ TestOpcode = X86::TEST64rr;
+ } else if (TrailingZeros == 0 && SavesBytes) {
+ // If the mask covers the least significant bit, then we can replace
+ // TEST+AND with a SHL and check eflags.
+ // This emits a redundant TEST which is subsequently eliminated.
+ ShiftOpcode = X86::SHL64ri;
+ ShiftAmt = LeadingZeros;
+ SubRegIdx = 0;
+ TestOpcode = X86::TEST64rr;
+ } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
+ // If the shifted mask extends into the high half and is 8/16/32 bits
+ // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
+ unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
+ if (PopCount == 8) {
+ ShiftOpcode = X86::SHR64ri;
+ ShiftAmt = TrailingZeros;
+ SubRegIdx = X86::sub_8bit;
+ SubRegVT = MVT::i8;
+ TestOpcode = X86::TEST8rr;
+ } else if (PopCount == 16) {
+ ShiftOpcode = X86::SHR64ri;
+ ShiftAmt = TrailingZeros;
+ SubRegIdx = X86::sub_16bit;
+ SubRegVT = MVT::i16;
+ TestOpcode = X86::TEST16rr;
+ } else if (PopCount == 32) {
+ ShiftOpcode = X86::SHR64ri;
+ ShiftAmt = TrailingZeros;
+ SubRegIdx = X86::sub_32bit;
+ SubRegVT = MVT::i32;
+ TestOpcode = X86::TEST32rr;
+ }
}
- if (isMask_64(Mask)) {
- unsigned LeadingZeros = countLeadingZeros(Mask);
- SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
- SDValue Shift =
- SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
- N0.getOperand(0), Imm), 0);
- MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
- MVT::i32, Shift, Shift);
+ if (ShiftOpcode != ISD::DELETED_NODE) {
+ SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
+ SDValue Shift = SDValue(
+ CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
+ N0.getOperand(0), ShiftC),
+ 0);
+ if (SubRegIdx != 0) {
+ Shift =
+ CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
+ }
+ MachineSDNode *Test =
+ CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
ReplaceNode(Node, Test);
return;
}
@@ -5769,21 +5850,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
case X86ISD::SETCC_CARRY: {
- // We have to do this manually because tblgen will put the eflags copy in
- // the wrong place if we use an extract_subreg in the pattern.
MVT VT = Node->getSimpleValueType(0);
+ SDValue Result;
+ if (Subtarget->hasSBBDepBreaking()) {
+ // We have to do this manually because tblgen will put the eflags copy in
+ // the wrong place if we use an extract_subreg in the pattern.
+ // Copy flags to the EFLAGS register and glue it to next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(1), SDValue());
- // Copy flags to the EFLAGS register and glue it to next node.
- SDValue EFLAGS =
- CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
- Node->getOperand(1), SDValue());
-
- // Create a 64-bit instruction if the result is 64-bits otherwise use the
- // 32-bit version.
- unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
- MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
- SDValue Result = SDValue(
- CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+ // Create a 64-bit instruction if the result is 64-bits otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+ MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ Result = SDValue(
+ CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
+ 0);
+ } else {
+ // The target does not recognize sbb with the same reg operand as a
+ // no-source idiom, so we explicitly zero the input values.
+ Result = getSBBZero(Node);
+ }
// For less than 32-bits we need to extract from the 32-bit node.
if (VT == MVT::i8 || VT == MVT::i16) {
@@ -5798,35 +5886,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case X86ISD::SBB: {
if (isNullConstant(Node->getOperand(0)) &&
isNullConstant(Node->getOperand(1))) {
- MVT VT = Node->getSimpleValueType(0);
-
- // Create zero.
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
- SDValue Zero =
- SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
- if (VT == MVT::i64) {
- Zero = SDValue(
- CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
- CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
- CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
- 0);
- }
-
- // Copy flags to the EFLAGS register and glue it to next node.
- SDValue EFLAGS =
- CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
- Node->getOperand(2), SDValue());
-
- // Create a 64-bit instruction if the result is 64-bits otherwise use the
- // 32-bit version.
- unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
- MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
- VTs = CurDAG->getVTList(SBBVT, MVT::i32);
- SDValue Result =
- SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
- EFLAGS.getValue(1)}),
- 0);
+ SDValue Result = getSBBZero(Node);
// Replace the flag use.
ReplaceUses(SDValue(Node, 1), Result.getValue(1));
@@ -5834,6 +5894,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Replace the result use.
if (!SDValue(Node, 0).use_empty()) {
// For less than 32-bits we need to extract from the 32-bit node.
+ MVT VT = Node->getSimpleValueType(0);
if (VT == MVT::i8 || VT == MVT::i16) {
int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
@@ -6112,6 +6173,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
case InlineAsm::Constraint_v: // not offsetable ??
case InlineAsm::Constraint_m: // memory
case InlineAsm::Constraint_X:
+ case InlineAsm::Constraint_p: // address
if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
return true;
break;
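To make the shifted-mask rewrite in Select() above concrete: for a mask such as 0x0000FF0000000000 the eight set bits sit 40 positions up, so the AND+TEST pair becomes SHR64ri $40 followed by TEST8rr on the low sub-register. A small illustrative helper (not from the patch; the real code also checks use counts and immediate sizes) that computes the TEST width the pattern would pick:

#include <bit>
#include <cstdint>

// For a nonzero contiguous run of ones, return the sub-register TEST width
// (8/16/32) the transform above would use after shifting the run down to
// bit 0, or 0 if the run width has no matching sub-register form.
// The shift amount itself is just std::countr_zero(Mask).
inline unsigned testWidthForShiftedMask(uint64_t Mask) {
  unsigned Run = 64 - std::countl_zero(Mask) - std::countr_zero(Mask);
  return (Run == 8 || Run == 16 || Run == 32) ? Run : 0;
}
// Example: testWidthForShiftedMask(0x0000FF0000000000ULL) == 8.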
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 90753b5b4d33..61c1fd25031d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -108,9 +108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
- X86ScalarSSEf64 = Subtarget.hasSSE2();
- X86ScalarSSEf32 = Subtarget.hasSSE1();
- X86ScalarSSEf16 = Subtarget.hasFP16();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
@@ -170,7 +167,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
- if (!Subtarget.hasCmpxchg8b())
+ if (!Subtarget.canUseCMPXCHG8B())
setMaxAtomicSizeInBitsSupported(32);
// Set up the register classes.
@@ -200,7 +197,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Integer absolute.
- if (Subtarget.hasCMov()) {
+ if (Subtarget.canUseCMOV()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
if (Subtarget.is64Bit())
@@ -314,7 +311,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
- if (!X86ScalarSSEf64) {
+ if (!Subtarget.hasSSE2()) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
@@ -415,14 +412,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(Op, MVT::f128, Expand);
}
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f80, MVT::f16, Expand);
- setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+ for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
+ setTruncStoreAction(VT, MVT::f16, Expand);
+ setTruncStoreAction(VT, MVT::bf16, Expand);
+
+ setOperationAction(ISD::BF16_TO_FP, VT, Expand);
+ setOperationAction(ISD::FP_TO_BF16, VT, Expand);
+ }
setOperationAction(ISD::PARITY, MVT::i8, Custom);
setOperationAction(ISD::PARITY, MVT::i16, Custom);
@@ -497,7 +495,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
- if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
+ if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
@@ -516,9 +514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
- if (Subtarget.hasCmpxchg16b()) {
+ if (Subtarget.canUseCMPXCHG16B())
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
- }
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
@@ -535,7 +532,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
- if (Subtarget.getTargetTriple().isPS4CPU())
+ if (Subtarget.isTargetPS())
setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
else
setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
@@ -556,9 +553,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
- if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
- // f32 and f64 use SSE.
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
+ // f16, f32 and f64 use SSE.
// Set up the FP register classes.
+ addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
+ : &X86::FR16RegClass);
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
@@ -590,11 +591,54 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSINCOS, VT, Expand);
}
+ // Half type will be promoted by default.
+ setOperationAction(ISD::FABS, MVT::f16, Promote);
+ setOperationAction(ISD::FNEG, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+ setOperationAction(ISD::FADD, MVT::f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FMA, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SETCC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT, MVT::f16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
+ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
+
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
- } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+ } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
(UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
@@ -664,6 +708,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
+ // Support fp16 0 immediate.
+ if (isTypeLegal(MVT::f16))
+ addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
+
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
@@ -673,7 +721,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
@@ -725,7 +772,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
+ if (isTypeLegal(MVT::f16)) {
+ setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
+ } else {
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
+ }
// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
// as Custom.
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
@@ -877,7 +929,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
- if (VT.getVectorElementType() == MVT::f16)
+ if (VT.getVectorElementType() == MVT::f16 ||
+ VT.getVectorElementType() == MVT::bf16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
@@ -949,6 +1002,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
+ setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
@@ -1067,6 +1122,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
+ // Add 32-bit vector stores to help vectorization opportunities.
+ setOperationAction(ISD::STORE, MVT::v2i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i8, Custom);
+
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
@@ -1285,13 +1344,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT == MVT::v4i64) continue;
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::FSHL, VT, Custom);
+ setOperationAction(ISD::FSHR, VT, Custom);
}
- setOperationAction(ISD::FSHL, MVT::v32i8, Custom);
- setOperationAction(ISD::FSHR, MVT::v32i8, Custom);
- setOperationAction(ISD::FSHL, MVT::v8i32, Custom);
- setOperationAction(ISD::FSHR, MVT::v8i32, Custom);
-
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
@@ -1353,6 +1409,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
@@ -1446,6 +1504,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+ }
+
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
@@ -1652,6 +1717,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
@@ -1698,6 +1765,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
+ setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
+ setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
@@ -1970,10 +2039,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
- if (isTypeLegal(MVT::f80)) {
- setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
- }
setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
@@ -2059,9 +2124,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
setOperationAction(ISD::STORE, MVT::v4f16, Custom);
}
-
- // Support fp16 0 immediate
- addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
@@ -2209,55 +2271,55 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
- setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
- setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::CONCAT_VECTORS);
- setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
- setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
- setTargetDAGCombine(ISD::BITCAST);
- setTargetDAGCombine(ISD::VSELECT);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::SRL);
- setTargetDAGCombine(ISD::OR);
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
- setTargetDAGCombine(ISD::FNEG);
- setTargetDAGCombine(ISD::FMA);
- setTargetDAGCombine(ISD::STRICT_FMA);
- setTargetDAGCombine(ISD::FMINNUM);
- setTargetDAGCombine(ISD::FMAXNUM);
- setTargetDAGCombine(ISD::SUB);
- setTargetDAGCombine(ISD::LOAD);
- setTargetDAGCombine(ISD::MLOAD);
- setTargetDAGCombine(ISD::STORE);
- setTargetDAGCombine(ISD::MSTORE);
- setTargetDAGCombine(ISD::TRUNCATE);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
- setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::SIGN_EXTEND);
- setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
- setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
- setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
- setTargetDAGCombine(ISD::SINT_TO_FP);
- setTargetDAGCombine(ISD::UINT_TO_FP);
- setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
- setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
- setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::XOR);
- setTargetDAGCombine(ISD::MSCATTER);
- setTargetDAGCombine(ISD::MGATHER);
- setTargetDAGCombine(ISD::FP16_TO_FP);
- setTargetDAGCombine(ISD::FP_EXTEND);
- setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
- setTargetDAGCombine(ISD::FP_ROUND);
+ setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
+ ISD::SCALAR_TO_VECTOR,
+ ISD::INSERT_VECTOR_ELT,
+ ISD::EXTRACT_VECTOR_ELT,
+ ISD::CONCAT_VECTORS,
+ ISD::INSERT_SUBVECTOR,
+ ISD::EXTRACT_SUBVECTOR,
+ ISD::BITCAST,
+ ISD::VSELECT,
+ ISD::SELECT,
+ ISD::SHL,
+ ISD::SRA,
+ ISD::SRL,
+ ISD::OR,
+ ISD::AND,
+ ISD::ADD,
+ ISD::FADD,
+ ISD::FSUB,
+ ISD::FNEG,
+ ISD::FMA,
+ ISD::STRICT_FMA,
+ ISD::FMINNUM,
+ ISD::FMAXNUM,
+ ISD::SUB,
+ ISD::LOAD,
+ ISD::MLOAD,
+ ISD::STORE,
+ ISD::MSTORE,
+ ISD::TRUNCATE,
+ ISD::ZERO_EXTEND,
+ ISD::ANY_EXTEND,
+ ISD::SIGN_EXTEND,
+ ISD::SIGN_EXTEND_INREG,
+ ISD::ANY_EXTEND_VECTOR_INREG,
+ ISD::SIGN_EXTEND_VECTOR_INREG,
+ ISD::ZERO_EXTEND_VECTOR_INREG,
+ ISD::SINT_TO_FP,
+ ISD::UINT_TO_FP,
+ ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP,
+ ISD::SETCC,
+ ISD::MUL,
+ ISD::XOR,
+ ISD::MSCATTER,
+ ISD::MGATHER,
+ ISD::FP16_TO_FP,
+ ISD::FP_EXTEND,
+ ISD::STRICT_FP_EXTEND,
+ ISD::FP_ROUND});
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2568,9 +2630,9 @@ EVT X86TargetLowering::getOptimalMemOpType(
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
- return X86ScalarSSEf32;
+ return Subtarget.hasSSE1();
if (VT == MVT::f64)
- return X86ScalarSSEf64;
+ return Subtarget.hasSSE2();
return true;
}
@@ -3566,10 +3628,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
MFI.setObjectSExt(FI, true);
}
+ MaybeAlign Alignment;
+ if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
+ ValVT != MVT::f80)
+ Alignment = MaybeAlign(4);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ Alignment);
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
@@ -3906,7 +3973,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f16)
- RC = &X86::FR16XRegClass;
+ RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
@@ -4088,9 +4155,14 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
if (isByVal)
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+ MaybeAlign Alignment;
+ if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
+ Arg.getSimpleValueType() != MVT::f80)
+ Alignment = MaybeAlign(4);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
+ Alignment);
}
/// Emit a load of return address if tail call
@@ -5076,7 +5148,7 @@ bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
// If this is an unaligned vector, make sure the target supports folding it.
auto *Ld = cast<LoadSDNode>(Op.getNode());
if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
- Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
+ Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
return false;
// TODO: If this is a non-temporal load and the target has an instruction
@@ -5171,13 +5243,6 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
}
}
-static bool isTargetShuffleSplat(SDValue Op) {
- unsigned Opcode = Op.getOpcode();
- if (Opcode == ISD::EXTRACT_SUBVECTOR)
- return isTargetShuffleSplat(Op.getOperand(0));
- return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
-}
-
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -5429,6 +5494,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
return true;
+ case Intrinsic::x86_atomic_bts:
+ case Intrinsic::x86_atomic_btc:
+ case Intrinsic::x86_atomic_btr: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ unsigned Size = I.getType()->getScalarSizeInBits();
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
+ Info.align = Align(Size);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
}
return false;
}
@@ -5643,6 +5720,22 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
+bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
+ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+}
+
+bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !Subtarget.hasSSE2() || VT == MVT::f80;
+}
+
+bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
+ (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
+}
+
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const {
@@ -5755,6 +5848,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
+ // TODO: Should we always create i64 masks? Or only folded immediates?
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
@@ -6281,7 +6375,8 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
-static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
+ SelectionDAG &DAG) {
assert(Ops.empty() && "Expected an empty ops vector");
if (N->getOpcode() == ISD::CONCAT_VECTORS) {
@@ -6297,21 +6392,34 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
EVT SubVT = Sub.getValueType();
// TODO - Handle more general insert_subvector chains.
- if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
- Idx == (VT.getVectorNumElements() / 2)) {
- // insert_subvector(insert_subvector(undef, x, lo), y, hi)
- if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(1).getValueType() == SubVT &&
- isNullConstant(Src.getOperand(2))) {
- Ops.push_back(Src.getOperand(1));
+ if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
+ // insert_subvector(undef, x, lo)
+ if (Idx == 0 && Src.isUndef()) {
Ops.push_back(Sub);
+ Ops.push_back(DAG.getUNDEF(SubVT));
return true;
}
- // insert_subvector(x, extract_subvector(x, lo), hi)
- if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
- Ops.append(2, Sub);
- return true;
+ if (Idx == (VT.getVectorNumElements() / 2)) {
+ // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ // insert_subvector(x, extract_subvector(x, lo), hi)
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+ Ops.append(2, Sub);
+ return true;
+ }
+ // insert_subvector(undef, x, hi)
+ if (Src.isUndef()) {
+ Ops.push_back(DAG.getUNDEF(SubVT));
+ Ops.push_back(Sub);
+ return true;
+ }
}
}
}
@@ -6770,7 +6878,7 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
}
}
SmallVector<SDValue, 2> CatOps;
- if (collectConcatOps(V.getNode(), CatOps)) {
+ if (collectConcatOps(V.getNode(), CatOps, DAG)) {
for (SDValue &CatOp : CatOps) {
SDValue NotCat = IsNOT(CatOp, DAG);
if (!NotCat) return SDValue();
@@ -7934,8 +8042,35 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
}
}
+// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
+static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
+ SDValue Cond, bool IsBLENDV = false) {
+ EVT CondVT = Cond.getValueType();
+ unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
+ unsigned NumElts = CondVT.getVectorNumElements();
+
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
+ true, false))
+ return false;
+
+ Mask.resize(NumElts, SM_SentinelUndef);
+
+ for (int i = 0; i != (int)NumElts; ++i) {
+ Mask[i] = i;
+ // Arbitrarily choose from the 2nd operand if the select condition element
+ // is undef.
+ // TODO: Can we do better by matching patterns such as even/odd?
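+    // For BLENDV only the sign bit of each condition element matters: a clear
+    // sign bit (non-negative element) selects from the 2nd operand.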
+ if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
+ (IsBLENDV && EltBits[i].isNonNegative()))
+ Mask[i] += NumElts;
+ }
+
+ return true;
+}
+
// Forward declaration (for getFauxShuffleMask recursive check).
-// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
const SelectionDAG &DAG, unsigned Depth,
@@ -7987,11 +8122,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
+ // We can't assume an undef src element gives an undef dst - the other src
+ // might be zero.
+ if (!UndefElts.isZero())
+ return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
- if (UndefElts[i]) {
- Mask.push_back(SM_SentinelUndef);
- continue;
- }
const APInt &ByteBits = EltBits[i];
if (ByteBits != 0 && ByteBits != 255)
return false;
@@ -8240,6 +8375,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
return true;
}
+ case ISD::VSELECT:
+ case X86ISD::BLENDV: {
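+    // A VSELECT/BLENDV with a constant condition mask is just a blend of the
+    // two value operands - treat it as a shuffle.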
+ SDValue Cond = N.getOperand(0);
+ if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
+ Ops.push_back(N.getOperand(1));
+ Ops.push_back(N.getOperand(2));
+ return true;
+ }
+ return false;
+ }
case X86ISD::VTRUNC: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
@@ -9076,7 +9221,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
- if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
+ if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
@@ -9462,7 +9607,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
- if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ if (ScalarSize == 32 ||
+ (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
(ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
@@ -11651,33 +11797,6 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
return true;
}
-// Attempt to create a shuffle mask from a VSELECT condition mask.
-static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
- SDValue Cond) {
- EVT CondVT = Cond.getValueType();
- unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
- unsigned NumElts = CondVT.getVectorNumElements();
-
- APInt UndefElts;
- SmallVector<APInt, 32> EltBits;
- if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
- true, false))
- return false;
-
- Mask.resize(NumElts, SM_SentinelUndef);
-
- for (int i = 0; i != (int)NumElts; ++i) {
- Mask[i] = i;
- // Arbitrarily choose from the 2nd operand if the select condition element
- // is undef.
- // TODO: Can we do better by matching patterns such as even/odd?
- if (UndefElts[i] || EltBits[i].isZero())
- Mask[i] += NumElts;
- }
-
- return true;
-}
-
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
@@ -13943,8 +14062,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
- V = peekThroughBitcasts(V);
- return ISD::isNON_EXTLoad(V.getNode());
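+  // Only consider one-use loads - folding a load that has other uses would
+  // just repeat the memory access.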
+ return V->hasOneUse() &&
+ ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
/// Try to lower insertion of a single element into a zero vector.
@@ -15796,7 +15915,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1 = extract128BitVector(V1V2, 0, DAG, DL);
V2 = extract128BitVector(V1V2, 4, DAG, DL);
} else {
- SmallVector<SDValue> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
+ SmallVector<SDValue, 4> DWordClearOps(4,
+ DAG.getConstant(0, DL, MVT::i32));
for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
SDValue DWordClearMask =
@@ -16615,9 +16735,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
// otherwise we're (probably) better off doing a split.
if (VT == MVT::v4f64 &&
!all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
- if (SDValue V =
- lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
- return V;
+ return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
@@ -17229,114 +17347,135 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
- SmallVector<int, 8> RepeatedShuffleMask;
- if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
- // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
- // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
- int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
- int NumSubLanes = NumLanes * SubLaneScale;
- int NumSubLaneElts = NumLaneElts / SubLaneScale;
-
- // Check that all the sources are coming from the same lane and see if we can
- // form a repeating shuffle mask (local to each sub-lane). At the same time,
- // determine the source sub-lane for each destination sub-lane.
- int TopSrcSubLane = -1;
- SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
- SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
- SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
- SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
-
- for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
- // Extract the sub-lane mask, check that it all comes from the same lane
- // and normalize the mask entries to come from the first lane.
- int SrcLane = -1;
- SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
- for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
- int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
- if (M < 0)
+  // Helper to check for a repeated mask in each split sublane, and whether
+  // those sublanes can then be permuted into place.
+ auto ShuffleSubLanes = [&](int SubLaneScale) {
+ int NumSubLanes = NumLanes * SubLaneScale;
+ int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+ // Check that all the sources are coming from the same lane and see if we
+ // can form a repeating shuffle mask (local to each sub-lane). At the same
+ // time, determine the source sub-lane for each destination sub-lane.
+ int TopSrcSubLane = -1;
+ SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+ SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
+ SubLaneScale,
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
+
+ for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
+ // Extract the sub-lane mask, check that it all comes from the same lane
+ // and normalize the mask entries to come from the first lane.
+ int SrcLane = -1;
+ SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumLaneElts;
+ if ((0 <= SrcLane) && (SrcLane != Lane))
+ return SDValue();
+ SrcLane = Lane;
+ int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
+ SubLaneMask[Elt] = LocalM;
+ }
+
+ // Whole sub-lane is UNDEF.
+ if (SrcLane < 0)
continue;
- int Lane = (M % NumElts) / NumLaneElts;
- if ((0 <= SrcLane) && (SrcLane != Lane))
- return SDValue();
- SrcLane = Lane;
- int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
- SubLaneMask[Elt] = LocalM;
- }
- // Whole sub-lane is UNDEF.
- if (SrcLane < 0)
- continue;
+ // Attempt to match against the candidate repeated sub-lane masks.
+ for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
+ auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
+ for (int i = 0; i != NumSubLaneElts; ++i) {
+ if (M1[i] < 0 || M2[i] < 0)
+ continue;
+ if (M1[i] != M2[i])
+ return false;
+ }
+ return true;
+ };
- // Attempt to match against the candidate repeated sub-lane masks.
- for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
- auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
+ if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
+ continue;
+
+ // Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
- if (M1[i] < 0 || M2[i] < 0)
+ int M = SubLaneMask[i];
+ if (M < 0)
continue;
- if (M1[i] != M2[i])
- return false;
+ assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
+ "Unexpected mask element");
+ RepeatedSubLaneMask[i] = M;
}
- return true;
- };
- auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
- if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
- continue;
+ // Track the top most source sub-lane - by setting the remaining to
+ // UNDEF we can greatly simplify shuffle matching.
+ int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
+ TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+ Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
+ break;
+ }
+
+ // Bail if we failed to find a matching repeated sub-lane mask.
+ if (Dst2SrcSubLanes[DstSubLane] < 0)
+ return SDValue();
+ }
+ assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+ "Unexpected source lane");
- // Merge the sub-lane mask into the matching repeated sub-lane mask.
- for (int i = 0; i != NumSubLaneElts; ++i) {
- int M = SubLaneMask[i];
+ // Create a repeating shuffle mask for the entire vector.
+ SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+ for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
+ int Lane = SubLane / SubLaneScale;
+ auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
+ for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+ int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
- assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
- "Unexpected mask element");
- RepeatedSubLaneMask[i] = M;
+ int Idx = (SubLane * NumSubLaneElts) + Elt;
+ RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
-
- // Track the top most source sub-lane - by setting the remaining to UNDEF
- // we can greatly simplify shuffle matching.
- int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
- TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
- Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
- break;
}
+ SDValue RepeatedShuffle =
+ DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
- // Bail if we failed to find a matching repeated sub-lane mask.
- if (Dst2SrcSubLanes[DstSubLane] < 0)
- return SDValue();
- }
- assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
- "Unexpected source lane");
-
- // Create a repeating shuffle mask for the entire vector.
- SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
- for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
- int Lane = SubLane / SubLaneScale;
- auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
- for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
- int M = RepeatedSubLaneMask[Elt];
- if (M < 0)
+ // Shuffle each source sub-lane to its destination.
+ SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+ int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+ if (SrcSubLane < 0)
continue;
- int Idx = (SubLane * NumSubLaneElts) + Elt;
- RepeatedMask[Idx] = M + (Lane * NumLaneElts);
+ for (int j = 0; j != NumSubLaneElts; ++j)
+ SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
- }
- SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
- // Shuffle each source sub-lane to its destination.
- SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
- for (int i = 0; i != NumElts; i += NumSubLaneElts) {
- int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
- if (SrcSubLane < 0)
- continue;
- for (int j = 0; j != NumSubLaneElts; ++j)
- SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
- }
+ return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+ SubLaneMask);
+ };
- return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
- SubLaneMask);
+ // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+ // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
+ // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
+ // Otherwise we can only permute whole 128-bit lanes.
+ int MinSubLaneScale = 1, MaxSubLaneScale = 1;
+ if (Subtarget.hasAVX2() && VT.is256BitVector()) {
+ bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
+ MinSubLaneScale = 2;
+ MaxSubLaneScale =
+ (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
+ }
+ if (Subtarget.hasBWI() && VT == MVT::v64i8)
+ MinSubLaneScale = MaxSubLaneScale = 4;
+
+ for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
+ if (SDValue Shuffle = ShuffleSubLanes(Scale))
+ return Shuffle;
+
+ return SDValue();
}
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
@@ -17513,6 +17652,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Op;
+ bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
+ bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
+
// If we have lane crossing shuffles AND they don't all come from the lower
// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
@@ -17521,13 +17663,11 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
(V1.getOpcode() != ISD::BUILD_VECTOR) &&
(V2.getOpcode() != ISD::BUILD_VECTOR))
- if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
- Mask, DAG))
- return Op;
+ return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
// If we have one input in place, then we can permute the other input and
// blend the result.
- if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ if (V1IsInPlace || V2IsInPlace)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
@@ -17541,8 +17681,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
- isShuffleMaskInputInPlace(1, Mask))))
+ if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -17635,9 +17774,12 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
+ bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
+ bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
+
// If we have one input in place, then we can permute the other input and
// blend the result.
- if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
+ if (V1IsInPlace || V2IsInPlace)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
@@ -17647,12 +17789,16 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return V;
+ // Try to lower to PERMQ(BLENDD(V1,V2)).
+ if (SDValue V =
+ lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ return V;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!isShuffleMaskInputInPlace(0, Mask) &&
- !isShuffleMaskInputInPlace(1, Mask))
+ if (!V1IsInPlace && !V2IsInPlace)
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
@@ -18657,20 +18803,34 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // VBMI can use VPERMV/VPERMV3 byte shuffles.
- if (Subtarget.hasVBMI())
- return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
-
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
+ DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
+ return Result;
+
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
+ if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
+ // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
+ // PALIGNR will be cheaper than the second PSHUFB+OR.
+ if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return V;
+
+ // If we can't directly blend but can use PSHUFB, that will be better as it
+ // can both shuffle and set up the inefficient blend.
+ bool V1InUse, V2InUse;
+ return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
+ DAG, V1InUse, V2InUse);
+ }
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (!V2.isUndef())
@@ -18678,7 +18838,10 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Result;
- // FIXME: Implement direct support for this type!
+ // VBMI can use VPERMV/VPERMV3 byte shuffles.
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
+
return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -18915,7 +19078,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Offset += NumElts; // Increment for next iteration.
}
-
+ // If we're broadcasting a SETCC result, try to broadcast the ops instead.
+ // TODO: What other unary shuffles would benefit from this?
+ if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
+ V1->hasOneUse()) {
+ SDValue Op0 = V1.getOperand(0);
+ SDValue Op1 = V1.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
+ EVT OpVT = Op0.getValueType();
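+    // Broadcast both compare operands and redo the comparison - this avoids
+    // having to shuffle the narrow vXi1 mask result directly.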
+ return DAG.getSetCC(
+ DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
+ DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
+ }
MVT ExtVT;
switch (VT.SimpleTy) {
@@ -19619,9 +19793,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
if (IsZeroElt || IsAllOnesElt) {
- // Lower insertion of i8 -1 as an 'OR' blend.
+    // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
// We don't deal with i8 0 since it appears to be handled elsewhere.
- if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
+ if (IsAllOnesElt &&
+ ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
+ ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
@@ -19652,7 +19828,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
- (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
+ (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
DAG.getTargetConstant(1, dl, MVT::i8));
@@ -19666,7 +19842,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// If we are not inserting into the low 128-bit vector chunk,
// then prefer the broadcast+blend sequence.
// FIXME: relax the profitability check iff all N1 uses are insertions.
- if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
+ if (IdxVal >= NumEltsIn128 &&
((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
(Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
X86::mayFoldLoad(N1, Subtarget)))) {
@@ -20617,6 +20793,35 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
return Cvt;
}
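+// An f16 type is "soft" when the subtarget has no native FP16 instruction
+// support, so the value must be promoted to f32 (or handled as i16 bits).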
+template <typename T>
+static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
+ return VT == MVT::f16 && !Subtarget.hasFP16();
+}
+
+template <typename T>
+bool X86TargetLowering::isSoftFP16(T VT) const {
+ return ::isSoftFP16(VT, Subtarget);
+}
+
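+// Promote a soft-f16 [US]INT_TO_FP by converting to f32 (or the equivalent
+// f32 vector type) and rounding the result back down.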
+static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
+ MVT VT = Op.getSimpleValueType();
+ MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
+ SDLoc dl(Op);
+
+ SDValue Rnd = DAG.getIntPtrConstant(0, dl);
+ if (IsStrict)
+ return DAG.getNode(
+ ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
+ {Chain,
+ DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
+ Rnd});
+ return DAG.getNode(ISD::FP_ROUND, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
+}
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20627,6 +20832,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (isSoftFP16(VT))
+ return promoteXINT_TO_FP(Op, DAG);
+
if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
return LowerWin64_INT128_TO_FP(Op, DAG);
@@ -21123,9 +21331,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MVT DstVT = Op->getSimpleValueType(0);
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+ // Bail out when we don't have native conversion instructions.
if (DstVT == MVT::f128)
return SDValue();
+ if (isSoftFP16(DstVT))
+ return promoteXINT_TO_FP(Op, DAG);
+
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -21158,9 +21370,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// The transform for i64->f64 isn't correct for 0 when rounding to negative
// infinity. It produces -0.0, so disable under strictfp.
- if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
+ !IsStrict)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
- if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
+  // The transform for i32->f64/f32 isn't correct for 0 when rounding to
+  // negative infinity, so disable it under strictfp; FILD is used instead.
+ if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
+ !IsStrict)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
(DstVT == MVT::f32 || DstVT == MVT::f64))
@@ -21819,27 +22035,25 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
- In = DAG.getBitcast(MVT::v8i32, In);
-
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+ In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(4, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(2, DL));
static const int ShufMask[] = {0, 2, 4, 6};
- return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+ return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
+ DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
- In = DAG.getBitcast(MVT::v32i8, In);
-
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
// The PSHUFB mask:
@@ -21847,27 +22061,30 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
+ In = DAG.getBitcast(MVT::v32i8, In);
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
- DAG.getBitcast(MVT::v16i16, In),
- DAG.getIntPtrConstant(0, DL));
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getBitcast(MVT::v8i16, In);
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
- DAG.getIntPtrConstant(16, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4, DL));
// The PSHUFB mask:
- static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
- -1, -1, -1, -1, -1, -1, -1, -1};
+ static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
+
+ OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
+ OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
- OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
+ OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -21941,6 +22158,16 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Res;
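+  // For a soft f16 source, extend to f32 first and convert from there.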
+ if (isSoftFP16(SrcVT)) {
+ MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
+ if (IsStrict)
+ return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+ {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
+ {NVT, MVT::Other}, {Chain, Src})});
+ return DAG.getNode(Op.getOpcode(), dl, VT,
+ DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
+ }
+
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
@@ -22278,6 +22505,9 @@ SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT == MVT::f16)
+ return SDValue();
+
// If the source is in an SSE register, the node is Legal.
if (isScalarFPTypeInSSEReg(SrcVT))
return Op;
@@ -22349,7 +22579,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
// This code is only for floats and doubles. Fall back to generic code for
// anything else.
- if (!isScalarFPTypeInSSEReg(SrcVT))
+ if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
return SDValue();
EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -22381,11 +22611,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
// floating-point values.
APInt MinInt, MaxInt;
if (IsSigned) {
- MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
- MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
+ MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
+ MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
} else {
- MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
- MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
+ MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
+ MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
}
APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
@@ -22484,28 +22714,54 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
- if (VT == MVT::f128)
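+  // Leave f128 results and f16->f80 extensions to the default expansion.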
+ if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80))
return SDValue();
- if (VT == MVT::f80) {
- if (SVT == MVT::f16) {
- assert(Subtarget.hasFP16() && "Unexpected features!");
- RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
- MakeLibCallOptions CallOptions;
- std::pair<SDValue, SDValue> Tmp =
- makeLibCall(DAG, LC, VT, In, CallOptions, DL,
- IsStrict ? Op.getOperand(0) : SDValue());
+ if (SVT == MVT::f16) {
+ if (Subtarget.hasFP16())
+ return Op;
+
+ if (VT != MVT::f32) {
if (IsStrict)
- return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
- else
- return Tmp.first;
+ return DAG.getNode(
+ ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
+ {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
+ {MVT::f32, MVT::Other}, {Chain, In})});
+
+ return DAG.getNode(ISD::FP_EXTEND, DL, VT,
+ DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
}
- return Op;
+
+ if (!Subtarget.hasF16C())
+ return SDValue();
+
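+    // Use F16C CVTPH2PS: place the half in the low element of a zero vector,
+    // convert, then extract the resulting f32.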
+ In = DAG.getBitcast(MVT::i16, In);
+ In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
+ getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res;
+ if (IsStrict) {
+ Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
+ {Chain, In});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
+ DAG.getTargetConstant(4, DL, MVT::i32));
+ }
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, DL));
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
}
+ if (!SVT.isVector())
+ return Op;
+
if (SVT.getVectorElementType() == MVT::f16) {
assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
if (SVT == MVT::v2f16)
@@ -22531,15 +22787,65 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
+
+ SDLoc DL(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
- // It's legal except when f128 is involved or we're converting f80->f16.
- if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
- return Op;
+ if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
+ return SDValue();
- return SDValue();
+ if (VT == MVT::f16) {
+ if (Subtarget.hasFP16())
+ return Op;
+
+ if (SVT != MVT::f32) {
+ if (IsStrict)
+ return DAG.getNode(
+ ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
+ {Chain,
+ DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other},
+ {Chain, In, Op2}),
+ Op2});
+
+ return DAG.getNode(ISD::FP_ROUND, DL, VT,
+ DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2),
+ Op2);
+ }
+
+ if (!Subtarget.hasF16C())
+ return SDValue();
+
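+    // Use F16C CVTPS2PH: convert the f32 in the low vector element, then
+    // extract the resulting half as its i16 bit pattern.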
+ SDValue Res;
+ SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
+ MVT::i32);
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
+ DAG.getConstantFP(0, DL, MVT::v4f32), In,
+ DAG.getIntPtrConstant(0, DL));
+ Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
+ {Chain, Res, Rnd});
+ Chain = Res.getValue(1);
+ } else {
+ // FIXME: Should we use zeros for upper elements for non-strict?
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
+ Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
+ DAG.getIntPtrConstant(0, DL));
+ Res = DAG.getBitcast(MVT::f16, Res);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+
+ return Res;
+ }
+
+ return Op;
}
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -22857,6 +23163,47 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
return Res;
}
+/// Helper for attempting to create an X86ISD::BT node.
+static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL,
+                     SelectionDAG &DAG) {
+ // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bittest on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+  // Also promote i16 to i32 for performance / code size reasons.
+ if (Src.getValueType().getScalarSizeInBits() < 32)
+ Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
+
+ // No legal type found, give up.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
+ return SDValue();
+
+ // See if we can use the 32-bit instruction instead of the 64-bit one for a
+ // shorter encoding. Since the former takes the modulo 32 of BitNo and the
+ // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
+ // known to be zero.
+ if (Src.getValueType() == MVT::i64 &&
+ DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
+ Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (Src.getValueType() != BitNo.getValueType()) {
+ // Peek through a mask/modulo operation.
+ // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
+ // we probably need a better IsDesirableToPromoteOp to handle this as well.
+ if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
+ BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
+ DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
+ BitNo.getOperand(0)),
+ DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
+ BitNo.getOperand(1)));
+ else
+ BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
+ }
+
+ return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
+}
+
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
@@ -23303,7 +23650,7 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
return true;
// We never want to use both SQRT and RSQRT instructions for the same input.
- if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
return false;
if (VT.isVector())
@@ -23439,7 +23786,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
// Only perform this transform if CMOV is supported otherwise the select
// below will become a branch.
- if (!Subtarget.hasCMov())
+ if (!Subtarget.canUseCMOV())
return SDValue();
// fold (sdiv X, pow2)
@@ -23485,9 +23832,8 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
-static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG,
- SDValue &X86CC) {
+static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
+ SelectionDAG &DAG, X86::CondCode &X86CC) {
assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
@@ -23538,30 +23884,24 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (!Src.getNode())
return SDValue();
- // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
- // instruction. Since the shift amount is in-range-or-undefined, we know
- // that doing a bittest on the i32 value is ok. We extend to i32 because
- // the encoding for the i16 version is larger than the i32 version.
- // Also promote i16 to i32 for performance / code size reason.
- if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
- Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
+ // Remove any bit flip.
+ if (isBitwiseNot(Src)) {
+ Src = Src.getOperand(0);
+ CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
+ }
- // See if we can use the 32-bit instruction instead of the 64-bit one for a
- // shorter encoding. Since the former takes the modulo 32 of BitNo and the
- // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
- // known to be zero.
- if (Src.getValueType() == MVT::i64 &&
- DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
- Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+ // Attempt to create the X86ISD::BT node.
+ if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
+ X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ return BT;
+ }
- // If the operand types disagree, extend the shift amount to match. Since
- // BT ignores high bits (like shifts) we can use anyextend.
- if (Src.getValueType() != BitNo.getValueType())
- BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
+ return SDValue();
+}
- X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
- dl, MVT::i8);
- return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
+// Check if a pre-AVX condition code can be performed by a single FCMP op.
+static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
+ return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
@@ -23831,7 +24171,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
- if (SSECC >= 8) {
+ if (!cheapX86FSETCC_SSE(Cond)) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
@@ -23996,10 +24336,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
- if (VT == MVT::v32i16 || VT == MVT::v64i8) {
- assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
+ // Break 512-bit integer vector compare into smaller ones.
+ // TODO: Try harder to use VPCMPx + VPMOV2x?
+ if (VT.is512BitVector())
return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
- }
// If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
// not-of-PCMPEQ:
@@ -24117,12 +24457,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
- SDValue SB;
- if (FlipSigns) {
- SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
- } else {
- SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
- }
+ SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
+ : 0x0000000080000000ULL,
+ dl, MVT::v2i64);
+
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
@@ -24261,8 +24599,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
+ X86::CondCode X86CondCode;
+ if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
+ X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
return BT;
+ }
}
// Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0.
@@ -24527,6 +24868,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op1.getSimpleValueType();
SDValue CC;
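+  // Without native FP16 support, perform the select on the i16 bit pattern.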
+ if (isSoftFP16(VT))
+ return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond,
+ DAG.getBitcast(MVT::i16, Op1),
+ DAG.getBitcast(MVT::i16, Op2)));
+
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
@@ -24591,7 +24937,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
- if (Cond.getOpcode() == ISD::SETCC) {
+ if (Cond.getOpcode() == ISD::SETCC &&
+ !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
@@ -24608,6 +24955,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
+ // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
+ // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
@@ -24624,7 +24973,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
};
- if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
+ if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
(CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
// Keep Cmp.
@@ -24652,7 +25001,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
Sub.getValue(1));
return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
- } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
+ } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue Src1, Src2;
@@ -24688,6 +25037,22 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
+ } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
+ Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
+ ((CondCode == X86::COND_S) || // smin(x, 0)
+ (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
+ // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
+ //
+ // If the comparison is testing for a positive value, we have to invert
+ // the sign bit mask, so only do that transform if the target has a
+ // bitwise 'and not' instruction (the invert is free).
+ // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
+ unsigned ShCt = VT.getSizeInBits() - 1;
+ SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
+ SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
+ if (CondCode == X86::COND_G)
+ Shift = DAG.getNOT(DL, Shift, VT);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
}
}
@@ -24707,7 +25072,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
- !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
+ !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -24734,9 +25099,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- SDValue BTCC;
- if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
- CC = BTCC;
+ X86::CondCode X86CondCode;
+ if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
+ CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
Cond = BT;
AddTest = false;
}
@@ -24788,7 +25153,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// legal, but EmitLoweredSelect() can not deal with these extensions
// being inserted between two CMOV's. (in i16 case too TBN)
// https://bugs.llvm.org/show_bug.cgi?id=40974
- if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
+ if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
(Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
!X86::mayFoldLoad(Op2, Subtarget))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
@@ -25153,16 +25518,20 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
!Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
- if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+ if (StoredVal.hasOneUse() &&
+ collectConcatOps(StoredVal.getNode(), CatOps, DAG))
return splitVectorStore(St, DAG);
return SDValue();
}
+ if (StoreVT.is32BitVector())
+ return SDValue();
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
- "Unexpected VT");
+ assert(StoreVT.is64BitVector() && "Unexpected VT");
assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
- TargetLowering::TypeWidenVector && "Unexpected type action!");
+ TargetLowering::TypeWidenVector &&
+ "Unexpected type action!");
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
@@ -25247,8 +25616,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
+ // Bail out when we don't have native compare instructions.
if (Cond.getOpcode() == ISD::SETCC &&
- Cond.getOperand(0).getValueType() != MVT::f128) {
+ Cond.getOperand(0).getValueType() != MVT::f128 &&
+ !isSoftFP16(Cond.getOperand(0).getValueType())) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -25647,116 +26018,116 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
- SmallVector<SDValue, 8> Elts;
- unsigned NumElts = SrcOp->getNumOperands();
-
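+    // Map to the equivalent generic shift opcode and let constant folding
+    // build the result vector.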
+ unsigned ShiftOpc;
switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
- for (unsigned i = 0; i != NumElts; ++i) {
- SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->isUndef()) {
- // Must produce 0s in the correct bits.
- Elts.push_back(DAG.getConstant(0, dl, ElementType));
- continue;
- }
- auto *ND = cast<ConstantSDNode>(CurrentOp);
- const APInt &C = ND->getAPIntValue();
- Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
- }
+ ShiftOpc = ISD::SHL;
break;
case X86ISD::VSRLI:
- for (unsigned i = 0; i != NumElts; ++i) {
- SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->isUndef()) {
- // Must produce 0s in the correct bits.
- Elts.push_back(DAG.getConstant(0, dl, ElementType));
- continue;
- }
- auto *ND = cast<ConstantSDNode>(CurrentOp);
- const APInt &C = ND->getAPIntValue();
- Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
- }
+ ShiftOpc = ISD::SRL;
break;
case X86ISD::VSRAI:
- for (unsigned i = 0; i != NumElts; ++i) {
- SDValue CurrentOp = SrcOp->getOperand(i);
- if (CurrentOp->isUndef()) {
- // All shifted in bits must be the same so use 0.
- Elts.push_back(DAG.getConstant(0, dl, ElementType));
- continue;
- }
- auto *ND = cast<ConstantSDNode>(CurrentOp);
- const APInt &C = ND->getAPIntValue();
- Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
- }
+ ShiftOpc = ISD::SRA;
break;
}
- return DAG.getBuildVector(VT, dl, Elts);
+ SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
+ if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
+ return C;
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}
-/// Handle vector element shifts where the shift amount may or may not be a
-/// constant. Takes immediate version of shift as input.
-/// TODO: Replace with vector + (splat) idx to avoid extract_element nodes.
+/// Handle vector element shifts by a splat shift amount.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
- SDValue SrcOp, SDValue ShAmt,
+ SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- MVT SVT = ShAmt.getSimpleValueType();
- assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
-
- // Change opcode to non-immediate version.
- Opc = getTargetVShiftUniformOpcode(Opc, true);
-
- // Need to build a vector containing shift amount.
- // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
- // +====================+============+=======================================+
- // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
- // +====================+============+=======================================+
- // | i64 | Yes, No | Use ShAmt as lowest elt |
- // | i32 | Yes | zero-extend in-reg |
- // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
- // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
- // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
- // +====================+============+=======================================+
-
- if (SVT == MVT::i64)
- ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
- else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
- ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
- ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
+ MVT AmtVT = ShAmt.getSimpleValueType();
+ assert(AmtVT.isVector() && "Vector shift type mismatch");
+ assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
+ "Illegal vector splat index");
+
+ // Move the splat element to the bottom element.
+ if (ShAmtIdx != 0) {
+ SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
+ Mask[0] = ShAmtIdx;
+ ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
+ }
+
+ // Peek through any zext node if we can get back to a 128-bit source.
+ if (AmtVT.getScalarSizeInBits() == 64 &&
+ (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+ ShAmt.getOperand(0).getValueType().isSimple() &&
+ ShAmt.getOperand(0).getValueType().is128BitVector()) {
ShAmt = ShAmt.getOperand(0);
- MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
- ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
- if (Subtarget.hasSSE41())
+ AmtVT = ShAmt.getSimpleValueType();
+ }
+
+ // See if we can mask off the upper elements using the existing source node.
+  // The shift uses the entire lower 64 bits of the amount vector, so there is
+  // no need to do this for vXi64 types.
+ bool IsMasked = false;
+ if (AmtVT.getScalarSizeInBits() < 64) {
+ if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
+ ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ // If the shift amount has come from a scalar, then zero-extend the scalar
+ // before moving to the vector.
+ ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
+ ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
+ AmtVT = MVT::v4i32;
+ IsMasked = true;
+ } else if (ShAmt.getOpcode() == ISD::AND) {
+ // See if the shift amount is already masked (e.g. for rotation modulo),
+ // then we can zero-extend it by setting all the other mask elements to
+ // zero.
+ SmallVector<SDValue> MaskElts(
+ AmtVT.getVectorNumElements(),
+ DAG.getConstant(0, dl, AmtVT.getScalarType()));
+ MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
+ SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
+ if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
+ {ShAmt.getOperand(1), Mask}))) {
+ ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
+ IsMasked = true;
+ }
+ }
+ }
+
+ // Extract if the shift amount vector is larger than 128-bits.
+ if (AmtVT.getSizeInBits() > 128) {
+ ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
+ AmtVT = ShAmt.getSimpleValueType();
+ }
+
+ // Zero-extend bottom element to v2i64 vector type, either by extension or
+ // shuffle masking.
+ if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
+ if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
+ ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
+ ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
+ } else if (Subtarget.hasSSE41()) {
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
- else {
+ } else {
SDValue ByteShift = DAG.getTargetConstant(
- (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
+ (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
}
- } else if (Subtarget.hasSSE41() &&
- ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
- MVT::v2i64, ShAmt);
- } else {
- SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
- DAG.getUNDEF(SVT)};
- ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
+ // Change opcode to non-immediate version.
+ Opc = getTargetVShiftUniformOpcode(Opc, true);
+
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
@@ -25907,8 +26278,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
// prologue to RBP in the parent function.
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit())
return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
@@ -26444,6 +26814,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case VSHIFT: {
SDValue SrcOp = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
+ assert(ShAmt.getValueType() == MVT::i32 &&
+ "Unexpected VSHIFT amount type");
// Catch shift-by-constant.
if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
@@ -26451,8 +26823,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getSimpleValueType(), SrcOp,
CShAmt->getZExtValue(), DAG);
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
- SrcOp, ShAmt, Subtarget, DAG);
+ SrcOp, ShAmt, 0, Subtarget, DAG);
}
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
@@ -27411,6 +27784,30 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
+ case Intrinsic::x86_atomic_bts:
+ case Intrinsic::x86_atomic_btc:
+ case Intrinsic::x86_atomic_btr: {
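+ // Lower to a locked BTS/BTC/BTR memory node. The original bit value is
+ // rebuilt from CF (SETB), zero-extended and shifted back to the tested bit
+ // position so it matches the AND in the source IR.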
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(2);
+ SDValue Op2 = Op.getOperand(3);
+ unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
+ : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
+ : X86ISD::LBTR;
+ SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
+ MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
+ SDValue Res =
+ DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
+ {Chain, Op1, Op2, Size}, VT, MMO);
+ Chain = Res.getValue(1);
+ Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
+ unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+ if (Imm)
+ Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+ DAG.getShiftAmountConstant(Imm, VT, DL));
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
+ }
}
return SDValue();
}
@@ -28394,11 +28791,27 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
-static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+
+ // For AVX1 cases, split to use legal ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
+ // Default to expand.
+ return SDValue();
+}
+
+static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- // For AVX1 cases, split to use legal ops (everything but v4i64).
- if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
+ // For AVX1 cases, split to use legal ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8)
@@ -29188,19 +29601,12 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
- unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
- if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
- if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
- MVT EltVT = VT.getVectorElementType();
- assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
- if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
- else if (EltVT.bitsLT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
-
- return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
- }
+ int BaseShAmtIdx = -1;
+ if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
+ if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
+ return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
+ Subtarget, DAG);
// vXi8 shifts - shift as v8i16 + mask result.
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
@@ -29212,13 +29618,12 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
- BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// Create the mask using vXi16 shifts. For shift-rights we need to move
// the upper byte down before splatting the vXi8 mask.
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
- BaseShAmt, Subtarget, DAG);
+ BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
if (Opcode != ISD::SHL)
BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
8, DAG);
@@ -29228,7 +29633,7 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
DAG.getBitcast(ExtVT, R), BaseShAmt,
- Subtarget, DAG);
+ BaseShAmtIdx, Subtarget, DAG);
Res = DAG.getBitcast(VT, Res);
Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
@@ -29236,8 +29641,9 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
- SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
- BaseShAmt, Subtarget, DAG);
+ SignMask =
+ getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
+ BaseShAmtIdx, Subtarget, DAG);
SignMask = DAG.getBitcast(VT, SignMask);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
@@ -29247,23 +29653,6 @@ static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
}
}
- // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
- if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
- Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
- Amt = Amt.getOperand(0);
- unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
- std::vector<SDValue> Vals(Ratio);
- for (unsigned i = 0; i != Ratio; ++i)
- Vals[i] = Amt.getOperand(i);
- for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
- for (unsigned j = 0; j != Ratio; ++j)
- if (Vals[j] != Amt.getOperand(i + j))
- return SDValue();
- }
-
- if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
- return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
- }
return SDValue();
}
@@ -29843,8 +30232,8 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
{Op0, Op1, Amt}, DAG, Subtarget);
}
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
- VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
- VT == MVT::v16i32) &&
+ VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
+ VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
"Unexpected funnel shift type!");
// fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
@@ -29867,7 +30256,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
// Split 256-bit integers on XOP/pre-AVX2 targets.
// Split 512-bit integers on non 512-bit BWI targets.
- if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 32) ||
+ if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
!Subtarget.hasAVX2())) ||
(VT.is512BitVector() && !Subtarget.useBWIRegs() &&
EltSizeInBits < 32)) {
@@ -29878,18 +30267,18 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
// Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
- if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) {
+ int ScalarAmtIdx = -1;
+ if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
// Uniform vXi16 funnel shifts can be efficiently handled by default.
if (EltSizeInBits == 16)
return SDValue();
SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
- ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32);
- Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, Subtarget,
- DAG);
- Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, Subtarget,
- DAG);
+ Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
+ ScalarAmtIdx, Subtarget, DAG);
+ Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
+ ScalarAmtIdx, Subtarget, DAG);
return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
}
}
@@ -30079,18 +30468,20 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// Attempt to fold as unpack(x,x) << zext(splat(y)):
// rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
// rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
- // TODO: Handle vXi16 cases on all targets.
- if (EltSizeInBits == 8 || EltSizeInBits == 32 ||
- (IsROTL && EltSizeInBits == 16 && !Subtarget.hasAVX())) {
- if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) {
+ if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
+ int BaseRotAmtIdx = -1;
+ if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
+ if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
+ unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
+ return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
+ }
unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
- BaseRotAmt = DAG.getZExtOrTrunc(BaseRotAmt, DL, MVT::i32);
Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
- Subtarget, DAG);
+ BaseRotAmtIdx, Subtarget, DAG);
Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
- Subtarget, DAG);
+ BaseRotAmtIdx, Subtarget, DAG);
return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
}
}
@@ -30273,14 +30664,15 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
+ return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
if (OpWidth == 128)
- return Subtarget.hasCmpxchg16b();
+ return Subtarget.canUseCMPXCHG16B();
return false;
}
-bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+TargetLoweringBase::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
@@ -30288,9 +30680,10 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
- return false;
+ return AtomicExpansionKind::None;
- return needsCmpXchgNb(MemType);
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
+ : AtomicExpansionKind::None;
}
// Note: this turns large loads into lock cmpxchg8b/16b.
@@ -30314,6 +30707,65 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
}
TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
+ // If the atomicrmw's result isn't actually used, we can just add a "lock"
+ // prefix to a normal instruction for these operations.
+ if (AI->use_empty())
+ return AtomicExpansionKind::None;
+
+ // If the atomicrmw's result is used only by an AND with a single-bit mask,
+ // we may be able to use a bts/btr/btc instruction for these operations.
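+ // e.g. (and (atomicrmw or i32* %p, 8), 8) can use "lock bts" on bit 3, and
+ // (and (atomicrmw and i32* %p, -9), 8) can use "lock btr".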
+ auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+ Instruction *I = AI->user_back();
+ if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
+ AI->getParent() != I->getParent())
+ return AtomicExpansionKind::CmpXChg;
+ // The following instruction must be an AND with a single-bit constant.
+ auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
+ unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
+ if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
+ return AtomicExpansionKind::CmpXChg;
+
+ if (AI->getOperation() == AtomicRMWInst::And)
+ return ~C1->getValue() == C2->getValue()
+ ? AtomicExpansionKind::BitTestIntrinsic
+ : AtomicExpansionKind::CmpXChg;
+
+ return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
+ : AtomicExpansionKind::CmpXChg;
+}
+
+void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
+ IRBuilder<> Builder(AI);
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ switch (AI->getOperation()) {
+ default:
+ llvm_unreachable("Unknown atomic operation");
+ case AtomicRMWInst::Or:
+ IID = Intrinsic::x86_atomic_bts;
+ break;
+ case AtomicRMWInst::Xor:
+ IID = Intrinsic::x86_atomic_btc;
+ break;
+ case AtomicRMWInst::And:
+ IID = Intrinsic::x86_atomic_btr;
+ break;
+ }
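+ // The single user (verified by shouldExpandLogicAtomicRMWInIR) is an AND
+ // with a power-of-2 mask; the bit to test is its trailing-zero count.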
+ Instruction *I = AI->user_back();
+ LLVMContext &Ctx = AI->getContext();
+ unsigned Imm =
+ countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
+ Function *BitTest =
+ Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
+ Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
+ Type::getInt8PtrTy(Ctx));
+ Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+ I->replaceAllUsesWith(Result);
+ I->eraseFromParent();
+ AI->eraseFromParent();
+}
+
+TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
@@ -30337,10 +30789,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
- // If the atomicrmw's result isn't actually used, we can just add a "lock"
- // prefix to a normal instruction for these operations.
- return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
- : AtomicExpansionKind::None;
+ return shouldExpandLogicAtomicRMWInIR(AI);
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
@@ -31552,16 +32001,12 @@ SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
-
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
- SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
-
- return NOOP;
+ return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
}
// Custom split CVTPS2PH with wide types.
@@ -31710,8 +32155,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
- case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
+ case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
@@ -31807,9 +32253,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res);
return;
}
- case X86ISD::VPMADDWD:
- case X86ISD::AVG: {
- // Legalize types for X86ISD::AVG/VPMADDWD by widening.
+ case X86ISD::VPMADDWD: {
+ // Legalize types for X86ISD::VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
@@ -32462,7 +32907,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
- assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
+ assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
@@ -32821,6 +33266,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOR)
NODE_NAME_CASE(LXOR)
NODE_NAME_CASE(LAND)
+ NODE_NAME_CASE(LBTS)
+ NODE_NAME_CASE(LBTC)
+ NODE_NAME_CASE(LBTR)
NODE_NAME_CASE(VZEXT_MOVL)
NODE_NAME_CASE(VZEXT_LOAD)
NODE_NAME_CASE(VEXTRACT_STORE)
@@ -33041,7 +33489,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SCALEF_RND)
NODE_NAME_CASE(SCALEFS)
NODE_NAME_CASE(SCALEFS_RND)
- NODE_NAME_CASE(AVG)
NODE_NAME_CASE(MULHRS)
NODE_NAME_CASE(SINT_TO_FP_RND)
NODE_NAME_CASE(UINT_TO_FP_RND)
@@ -33222,7 +33669,6 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const {
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
switch (Opcode) {
// TODO: Add more X86ISD opcodes once we have test coverage.
- case X86ISD::AVG:
case X86ISD::PCMPEQ:
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ:
@@ -33418,6 +33864,20 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
+bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
+ EVT VT) const {
+ // TODO: This is too general. There are cases where pre-AVX512 codegen would
+ // benefit. The transform may also be profitable for scalar code.
+ if (!Subtarget.hasAVX512())
+ return false;
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ return false;
+ if (!VT.isVector())
+ return false;
+
+ return true;
+}
+
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
@@ -33460,6 +33920,16 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
return TargetLowering::areJTsAllowed(Fn);
}
+MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
+ EVT ConditionVT) const {
+ // Avoid 8 and 16 bit types because they increase the chance for unnecessary
+ // zero-extensions.
+ if (ConditionVT.getSizeInBits() < 32)
+ return MVT::i32;
+ return TargetLoweringBase::getPreferredSwitchConditionType(Context,
+ ConditionVT);
+}
+
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
@@ -33871,6 +34341,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
+ case X86::CMOV_FR16:
case X86::CMOV_FR16X:
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
@@ -34090,7 +34561,7 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
- Register DestReg = FirstCMOV.getOperand(0).getReg();
+ Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
Register Op1Reg = FirstCMOV.getOperand(1).getReg();
Register Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
@@ -34103,11 +34574,6 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// The second SecondInsertedMBB provides the same incoming value as the
// FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
- // Copy the PHI result to the register defined by the second CMOV.
- BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
- TII->get(TargetOpcode::COPY),
- SecondCascadedCMOV.getOperand(0).getReg())
- .addReg(FirstCMOV.getOperand(0).getReg());
// Now remove the CMOVs.
FirstCMOV.eraseFromParent();
@@ -35546,6 +36012,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
+ case X86::CMOV_FR16:
+ case X86::CMOV_FR16X:
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
@@ -36116,6 +36584,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::AND: {
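+ // X86ISD::AND also produces EFLAGS; known bits only apply to the integer
+ // result (result 0).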
+ if (Op.getResNo() == 0) {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known &= Known2;
+ }
+ break;
+ }
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -36257,6 +36734,28 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.setAllZero();
break;
}
+ case X86ISD::VBROADCAST_LOAD: {
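+ // If the broadcast source is constant, every demanded element shares the
+ // same known bits; any demanded undef element forces the conservative
+ // all-unknown result.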
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
+ /*AllowWholeUndefs*/ false,
+ /*AllowPartialUndefs*/ false)) {
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ if (UndefElts[I]) {
+ Known.resetAll();
+ break;
+ }
+ KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
+ Known = KnownBits::commonBits(Known, Known2);
+ }
+ return;
+ }
+ break;
+ }
}
// Handle target shuffles.
@@ -37113,9 +37612,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumRootElts = RootVT.getVectorNumElements();
// Canonicalize shuffle input op to the requested type.
- // TODO: Support cases where Op is smaller than VT.
auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
- if (VT.getSizeInBits() < Op.getValueSizeInBits())
+ if (VT.getSizeInBits() > Op.getValueSizeInBits())
+ Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
+ else if (VT.getSizeInBits() < Op.getValueSizeInBits())
Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
return DAG.getBitcast(VT, Op);
};
@@ -37129,8 +37629,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
- assert(VT1.getSizeInBits() == RootSizeInBits &&
- VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
+ assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
+ (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
SDValue Res;
@@ -37157,12 +37657,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- // If we are shuffling a broadcast (and not introducing zeros) then
- // we can just use the broadcast directly. This works for smaller broadcast
- // elements as well as they already repeat across each mask element
- if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+ // If we are shuffling a splat (and not introducing zeros) then we can just
+ // use it directly. This also works for smaller elements, since they already
+ // repeat across each mask element.
+ if (UnaryShuffle && !isAnyZero(BaseMask) &&
+ V1.getValueSizeInBits() >= RootSizeInBits &&
(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
- V1.getValueSizeInBits() >= RootSizeInBits) {
+ DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
return CanonicalizeShuffleInput(RootVT, V1);
}
@@ -37543,7 +38044,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(RootVT.is128BitVector() && Subtarget.hasVLX())) &&
(MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
- if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+ // Bail if this was already a truncation or PACK node.
+ // We sometimes fail to match PACK if we demand known undef elements.
+ if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
+ Root.getOpcode() == X86ISD::PACKSS ||
+ Root.getOpcode() == X86ISD::PACKUS))
return SDValue(); // Nothing to do!
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
@@ -37852,6 +38357,12 @@ static SDValue combineX86ShuffleChainWithExtract(
unsigned RootSizeInBits = RootVT.getSizeInBits();
assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
+ // Bail if we have any smaller inputs.
+ if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
+ return Input.getValueSizeInBits() < RootSizeInBits;
+ }))
+ return SDValue();
+
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
@@ -37894,16 +38405,6 @@ static SDValue combineX86ShuffleChainWithExtract(
}))
return SDValue();
- for (SDValue &NewInput : WideInputs) {
- assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
- "Shuffle vector size mismatch");
- if (WideSizeInBits > NewInput.getValueSizeInBits())
- NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
- SDLoc(NewInput), WideSizeInBits);
- assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
- "Unexpected subvector extraction");
- }
-
// Create new mask for larger type.
for (unsigned i = 1; i != NumInputs; ++i)
Offsets[i] += i * Scale * NumMaskElts;
@@ -37928,7 +38429,10 @@ static SDValue combineX86ShuffleChainWithExtract(
// Attempt to combine wider chain.
// TODO: Can we use a better Root?
- SDValue WideRoot = WideInputs[0];
+ SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
+ WideInputs.back().getValueSizeInBits()
+ ? WideInputs.front()
+ : WideInputs.back();
if (SDValue WideShuffle =
combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
HasVariableMask, AllowVariableCrossLaneMask,
@@ -38267,9 +38771,9 @@ static SDValue combineX86ShufflesRecursively(
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
- assert(Root.getSimpleValueType().isVector() &&
- "Shuffles operate on vector types!");
- unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+ MVT RootVT = Root.getSimpleValueType();
+ assert(RootVT.isVector() && "Shuffles operate on vector types!");
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
@@ -38298,16 +38802,27 @@ static SDValue combineX86ShufflesRecursively(
APInt OpUndef, OpZero;
APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
- if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
- OpZero, DAG, Depth, false))
- return SDValue();
-
- // Shuffle inputs must not be larger than the shuffle result.
- // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
- if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
- return OpInput.getValueSizeInBits() > VT.getSizeInBits();
- }))
+ if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, DAG, Depth, false)) {
+ // Shuffle inputs must not be larger than the shuffle result.
+ // TODO: Relax this for single input faux shuffles (e.g. trunc).
+ if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
+ return OpInput.getValueSizeInBits() > VT.getSizeInBits();
+ }))
+ return SDValue();
+ } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
+ !isNullConstant(Op.getOperand(1))) {
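+ // Treat a non-zero-index subvector extract as a shuffle of its wider
+ // source, using an identity mask offset by the extract index.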
+ SDValue SrcVec = Op.getOperand(0);
+ int ExtractIdx = Op.getConstantOperandVal(1);
+ unsigned NumElts = VT.getVectorNumElements();
+ OpInputs.assign({SrcVec});
+ OpMask.assign(NumElts, SM_SentinelUndef);
+ std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
+ OpZero = OpUndef = APInt::getNullValue(NumElts);
+ } else {
return SDValue();
+ }
// If the shuffle result was smaller than the root, we need to adjust the
// mask indices and pad the mask with undefs.
@@ -38467,13 +38982,12 @@ static SDValue combineX86ShufflesRecursively(
// Handle the all undef/zero/ones cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
- return DAG.getUNDEF(Root.getValueType());
+ return DAG.getUNDEF(RootVT);
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
- return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
- SDLoc(Root));
+ return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
- return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
+ return getOnesVector(RootVT, DAG, SDLoc(Root));
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= IsOpVariableMask;
@@ -38533,7 +39047,7 @@ static SDValue combineX86ShufflesRecursively(
// NOTE: This will update the Ops and Mask.
if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
- return DAG.getBitcast(Root.getValueType(), HOp);
+ return DAG.getBitcast(RootVT, HOp);
// Try to refine our inputs given our knowledge of target shuffle mask.
for (auto I : enumerate(Ops)) {
@@ -38578,6 +39092,8 @@ static SDValue combineX86ShufflesRecursively(
// FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
// Widen any subvector shuffle inputs we've collected.
+ // TODO: Remove this to avoid generating temporary nodes; we should only
+ // widen once combineX86ShuffleChain has found a match.
if (any_of(Ops, [RootSizeInBits](SDValue Op) {
return Op.getValueSizeInBits() < RootSizeInBits;
})) {
@@ -38823,8 +39339,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SDValue N0 = V.getOperand(0);
SDValue N1 = V.getOperand(1);
unsigned Imm = V.getConstantOperandVal(2);
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
@@ -38869,21 +39384,24 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ShuffleVT = N.getValueType();
- auto IsMergeableWithShuffle = [](SDValue Op) {
+ auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
// AllZeros/AllOnes constants are freely shuffled and will peek through
// bitcasts. Other constant build vectors do not peek through bitcasts. Only
// merge with target shuffles if it has one use so shuffle combining is
- // likely to kick in.
+ // likely to kick in. Shuffles of splats are expected to be removed.
return ISD::isBuildVectorAllOnes(Op.getNode()) ||
ISD::isBuildVectorAllZeros(Op.getNode()) ||
ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
- (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
+ (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
+ (FoldLoad && isShuffleFoldableLoad(Op)) ||
+ DAG.isSplatValue(Op, /*AllowUndefs*/ false);
};
auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
// Ensure we only shuffle whole vector src elements, unless it's a logical
// binop where we can more aggressively move shuffles from dst to src.
return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
+ BinOp == X86ISD::ANDNP ||
(Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
};
@@ -38913,7 +39431,8 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
- if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
+ if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
+ IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
SDValue LHS, RHS;
Op00 = DAG.getBitcast(ShuffleVT, Op00);
Op01 = DAG.getBitcast(ShuffleVT, Op01);
@@ -39054,6 +39573,11 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
+ // FIXME: Remove this after we support vector FP16
+ if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(),
+ Subtarget))
+ return SDValue();
+
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
@@ -39471,7 +39995,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
SmallVector<SDValue> SubOps;
- if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+ if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
return SubOps[Idx & 1];
unsigned NumElts = Src.getValueType().getVectorNumElements();
if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
@@ -39581,7 +40105,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
- } else if (KnownUndef0[i] || KnownZero0[i]) {
+ }
+
+ if (KnownUndef0[i] || KnownZero0[i]) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
@@ -40016,16 +40542,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
- APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
- DCI))
+ if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
return SDValue(N, 0);
// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
// Perform this after other shuffle combines to allow inner shuffles to be
// combined away first.
- if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N)))
+ if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
return BinOp;
}
@@ -40212,6 +40736,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
Depth + 1))
return true;
+ // Fold shift(0,x) -> 0
+ if (DemandedElts.isSubsetOf(KnownZero))
+ return TLO.CombineTo(
+ Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
+
// Aggressively peek through ops to get at the demanded elts.
if (!DemandedElts.isAllOnes())
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
@@ -40232,9 +40761,16 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
+
+ // Fold shift(0,x) -> 0
+ if (DemandedElts.isSubsetOf(LHSZero))
+ return TLO.CombineTo(
+ Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
+
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
+
KnownZero = LHSZero;
break;
}
@@ -40316,6 +40852,57 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownZero.setHighBits(ShiftAmt);
break;
}
+ case X86ISD::ANDNP: {
+ // ANDNP = (~LHS & RHS);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
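+ // If either operand is a constant, only demand the bits/elements of the
+ // other operand that can still affect the result: LHS matters only where
+ // RHS is non-zero, and RHS matters only where ~LHS is non-zero.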
+ auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
+ APInt UndefElts;
+ SmallVector<APInt> EltBits;
+ int NumElts = VT.getVectorNumElements();
+ int EltSizeInBits = VT.getScalarSizeInBits();
+ APInt OpBits = APInt::getAllOnes(EltSizeInBits);
+ APInt OpElts = DemandedElts;
+ if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
+ EltBits)) {
+ OpBits.clearAllBits();
+ OpElts.clearAllBits();
+ for (int I = 0; I != NumElts; ++I)
+ if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero()))) {
+ OpBits |= Invert ? ~EltBits[I] : EltBits[I];
+ OpElts.setBit(I);
+ }
+ }
+ return std::make_pair(OpBits, OpElts);
+ };
+ std::pair<APInt, APInt> DemandLHS = GetDemandedMasks(RHS);
+ std::pair<APInt, APInt> DemandRHS = GetDemandedMasks(LHS, true);
+
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ if (SimplifyDemandedVectorElts(LHS, DemandLHS.second, LHSUndef, LHSZero,
+ TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandRHS.second, RHSUndef, RHSZero,
+ TLO, Depth + 1))
+ return true;
+
+ if (!DemandedElts.isAllOnes()) {
+ SDValue NewLHS = SimplifyMultipleUseDemandedBits(
+ LHS, DemandLHS.first, DemandLHS.second, TLO.DAG, Depth + 1);
+ SDValue NewRHS = SimplifyMultipleUseDemandedBits(
+ RHS, DemandRHS.first, DemandRHS.second, TLO.DAG, Depth + 1);
+ if (NewLHS || NewRHS) {
+ NewLHS = NewLHS ? NewLHS : LHS;
+ NewRHS = NewRHS ? NewRHS : RHS;
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
+ }
+ }
+ break;
+ }
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
@@ -40620,7 +41207,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
// Integer ops.
- case X86ISD::AVG:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
@@ -40651,10 +41237,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
}
- // For broadcasts, unless we *only* demand the 0'th element,
+ // For splats, unless we *only* demand the 0'th element,
// stop attempts at simplification here; we aren't going to improve things,
// and this is better than any potential shuffle.
- if (isTargetShuffleSplat(Op) && !DemandedElts.isOne())
+ if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
return false;
// Get target/faux shuffle mask.
@@ -40770,20 +41356,31 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
KnownBits KnownOp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
+
+ // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
- TLO, Depth + 1))
+ APInt DemandedMaskLHS = APInt::getAllOnes(64);
+ APInt DemandedMaskRHS = APInt::getAllOnes(64);
+
+ bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
+ if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
+ DemandedMaskLHS = DemandedMask;
+ if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
+ DemandedMaskRHS = DemandedMask;
+
+ if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
+ KnownOp, TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
- TLO, Depth + 1))
+ if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
+ KnownOp, TLO, Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded low bits.
SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
- LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
- RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
if (DemandedLHS || DemandedRHS) {
DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
@@ -41084,7 +41681,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero = KnownZero.zextOrSelf(BitWidth);
+ Known.Zero = KnownZero.zext(BitWidth);
Known.Zero.setHighBits(BitWidth - NumElts);
// MOVMSK only uses the MSB from each vector element.
@@ -41291,12 +41888,8 @@ bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
switch (Opc) {
case X86ISD::VBROADCAST:
case X86ISD::VBROADCAST_LOAD:
- // TODO: Permit vXi64 types on 32-bit targets.
- if (isTypeLegal(Op.getValueType().getVectorElementType())) {
- UndefElts = APInt::getNullValue(NumElts);
- return true;
- }
- return false;
+ UndefElts = APInt::getNullValue(NumElts);
+ return true;
}
return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
@@ -42840,10 +43433,29 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
return SDValue();
SDLoc DL(ExtElt);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
+
+ // Extend a v4i8/v8i8 vector to v16i8, with undef upper 64 bits.
+ auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
+ if (V.getValueType() == MVT::v4i8) {
+ if (ZeroExtend && Subtarget.hasSSE41()) {
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ DAG.getConstant(0, DL, MVT::v4i32),
+ DAG.getBitcast(MVT::i32, V),
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getBitcast(MVT::v16i8, V);
+ }
+ V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
+ ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
+ : DAG.getUNDEF(MVT::v4i8));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
+ DAG.getUNDEF(MVT::v8i8));
+ };
// vXi8 mul reduction - promote to vXi16 mul reduction.
if (Opc == ISD::MUL) {
- unsigned NumElts = VecVT.getVectorNumElements();
if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
return SDValue();
if (VecVT.getSizeInBits() >= 128) {
@@ -42858,11 +43470,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
}
} else {
- if (VecVT == MVT::v4i8)
- Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
- DAG.getUNDEF(MVT::v4i8));
- Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
- DAG.getUNDEF(MVT::v8i8));
+ Rdx = WidenToV16I8(Rdx, false);
Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
}
@@ -42882,24 +43490,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
// vXi8 add reduction - sub 128-bit vector.
if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
- if (VecVT == MVT::v4i8) {
- // Pad with zero.
- if (Subtarget.hasSSE41()) {
- Rdx = DAG.getBitcast(MVT::i32, Rdx);
- Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
- DAG.getConstant(0, DL, MVT::v4i32), Rdx,
- DAG.getIntPtrConstant(0, DL));
- Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- } else {
- Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
- DAG.getConstant(0, DL, VecVT));
- }
- }
- if (Rdx.getValueType() == MVT::v8i8) {
- // Pad with undef.
- Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
- DAG.getUNDEF(MVT::v8i8));
- }
+ Rdx = WidenToV16I8(Rdx, true);
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
DAG.getConstant(0, DL, MVT::v16i8));
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
@@ -42907,8 +43498,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
}
// Must be a >=128-bit vector with pow2 elements.
- if ((VecVT.getSizeInBits() % 128) != 0 ||
- !isPowerOf2_32(VecVT.getVectorNumElements()))
+ if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
return SDValue();
// vXi8 add reduction - sum lo/hi halves then use PSADBW.
@@ -42931,6 +43521,48 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
+ // See if we can use vXi8 PSADBW add reduction for larger zext types.
+ // If the source vector values are 0-255, then we can use PSADBW to
+ // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
+ // TODO: See if it's worth avoiding vXi16/i32 truncations?
+ if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
+ DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
+ (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
+ Subtarget.hasAVX512())) {
+ EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
+ Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
+ if (ByteVT.getSizeInBits() < 128)
+ Rdx = WidenToV16I8(Rdx, true);
+
+ // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+ auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
+ SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
+ return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
+ };
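+ // PSADBW against an all-zeros vector sums each group of 8 bytes into the
+ // corresponding 64-bit lane.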
+ MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
+ Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
+
+ // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
+ while (Rdx.getValueSizeInBits() > 128) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ VecVT = Lo.getValueType();
+ Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
+ }
+ assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
+
+ if (NumElts > 8) {
+ SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
+ Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
+ }
+
+ VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
+ Rdx = DAG.getBitcast(VecVT, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
// Only use (F)HADD opcodes if they aren't microcoded or if we're minimizing codesize.
if (!shouldUseHorizontalOp(true, DAG, Subtarget))
return SDValue();
@@ -42994,8 +43626,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
uint64_t Idx = CIdx->getZExtValue();
if (UndefVecElts[Idx])
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
- return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
- dl, VT);
+ return DAG.getConstant(EltBits[Idx].zext(VT.getScalarSizeInBits()), dl,
+ VT);
}
}
@@ -43076,29 +43708,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// but not
// i1 = extract_vector_elt t0:1, Constant:i64<2>
// since the latter would need its own MOVMSK.
- if (CIdx && SrcVT.getScalarType() == MVT::i1) {
+ if (SrcVT.getScalarType() == MVT::i1) {
+ bool IsVar = !CIdx;
SmallVector<SDNode *, 16> BoolExtracts;
unsigned ResNo = InputVector.getResNo();
- auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
+ auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
+ IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
return true;
}
return false;
};
+ // TODO: Can we drop the oneuse check for constant extracts?
if (all_of(InputVector->uses(), IsBoolExtract) &&
- BoolExtracts.size() > 1) {
+ (IsVar || BoolExtracts.size() > 1)) {
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
- unsigned MaskIdx = Use->getConstantOperandVal(1);
- APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
- SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
+ // Mask = 1 << MaskIdx
+ SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
+ SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
+ SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
@@ -43123,7 +43758,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
- !LikelyUsedAsVector) {
+ !LikelyUsedAsVector && LoadVec->isSimple()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue NewPtr =
TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
@@ -43133,16 +43768,111 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
SDValue Load =
DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
- SDValue Chain = Load.getValue(1);
- SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)};
- SDValue To[] = {Load, Chain};
- DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
- return SDValue(N, 0);
+ DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
+ return Load;
}
return SDValue();
}
+// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
+// This is more or less the reverse of combineBitcastvxi1.
+static SDValue combineToExtendBoolVectorInReg(
+ unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
+ Opcode != ISD::ANY_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ return SDValue();
+
+ EVT SVT = VT.getScalarType();
+ EVT InSVT = N0.getValueType().getScalarType();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
+ // Input type must be extending a bool vector (bit-casted from a scalar
+ // integer) to legal integer types.
+ if (!VT.isVector())
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
+ return SDValue();
+ if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ EVT SclVT = N00.getValueType();
+ if (!SclVT.isScalarInteger())
+ return SDValue();
+
+ SDValue Vec;
+ SmallVector<int> ShuffleMask;
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
+
+ // Broadcast the scalar integer to the vector elements.
+ if (NumElts > EltSizeInBits) {
+ // If the scalar integer is greater than the vector element size, then we
+ // must split it down into sub-sections for broadcasting. For example:
+ // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
+ // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
+ assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
+ unsigned Scale = NumElts / EltSizeInBits;
+ EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ Vec = DAG.getBitcast(VT, Vec);
+
+ for (unsigned i = 0; i != Scale; ++i)
+ ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+ (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+ // If we have register broadcast instructions, use the scalar size as the
+ // element type for the shuffle. Then cast to the wider element type. The
+ // widened bits won't be used, and this might allow the use of a broadcast
+ // load.
+ assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+ unsigned Scale = EltSizeInBits / NumElts;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ ShuffleMask.append(NumElts * Scale, 0);
+ Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+ Vec = DAG.getBitcast(VT, Vec);
+ } else {
+ // For a smaller scalar integer, we can simply any-extend it to the vector
+ // element size (we don't care about the upper bits) and broadcast it to all
+ // elements.
+ SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
+ ShuffleMask.append(NumElts, 0);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ }
+
+ // Now, mask the relevant bit in each element.
+ SmallVector<SDValue, 32> Bits;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int BitIdx = (i % EltSizeInBits);
+ APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
+ Bits.push_back(DAG.getConstant(Bit, DL, SVT));
+ }
+ SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
+ Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
+
+ // Compare against the bitmask and extend the result.
+ EVT CCVT = VT.changeVectorElementType(MVT::i1);
+ Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
+ Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
+
+ // For SEXT, this is now done; otherwise shift the result down for
+ // zero-extension.
+ if (Opcode == ISD::SIGN_EXTEND)
+ return Vec;
+ return DAG.getNode(ISD::SRL, DL, VT, Vec,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+}
+
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
@@ -43270,8 +44000,8 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
SDValue FVal = N->getOperand(2);
SmallVector<SDValue, 4> CatOpsT, CatOpsF;
if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
- !collectConcatOps(TVal.getNode(), CatOpsT) ||
- !collectConcatOps(FVal.getNode(), CatOpsF))
+ !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
+ !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
return SDValue();
auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -43360,19 +44090,17 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
if ((N->getOpcode() != ISD::VSELECT &&
N->getOpcode() != X86ISD::BLENDV) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
- // Don't optimize before the condition has been transformed to a legal type
- // and don't ever optimize vector selects that map to AVX512 mask-registers.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned BitWidth = Cond.getScalarValueSizeInBits();
- if (BitWidth < 8 || BitWidth > 64)
- return SDValue();
+ EVT VT = N->getValueType(0);
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
@@ -43384,8 +44112,6 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = N->getValueType(0);
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
@@ -43403,6 +44129,11 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
if (VT.is512BitVector())
return SDValue();
+ // Don't optimize before the condition has been transformed to a legal type
+ // and don't ever optimize vector selects that map to AVX512 mask-registers.
+ if (BitWidth < 8 || BitWidth > 64)
+ return SDValue();
+
auto OnlyUsedAsSelectCond = [](SDValue Cond) {
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
@@ -43542,9 +44273,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// Convert vselects with constant condition into shuffles.
- if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
+ if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
+ (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
SmallVector<int, 64> Mask;
- if (createShuffleMaskFromVSELECT(Mask, Cond))
+ if (createShuffleMaskFromVSELECT(Mask, Cond,
+ N->getOpcode() == X86ISD::BLENDV))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
@@ -43565,11 +44298,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// getConstVector sets negative shuffle mask values as undef, so ensure
// we hardcode SM_SentinelZero values to zero (0x80).
if (CondMask[i] < NumElts) {
- LHSMask[i] = (LHSMask[i] == SM_SentinelZero) ? 0x80 : LHSMask[i];
+ LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
RHSMask[i] = 0x80;
} else {
LHSMask[i] = 0x80;
- RHSMask[i] = (RHSMask[i] == SM_SentinelZero) ? 0x80 : RHSMask[i];
+ RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
}
}
LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
@@ -43586,7 +44319,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && VT != MVT::f128 &&
+ VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
@@ -43880,7 +44613,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
  // If this is an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
- Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
@@ -43889,6 +44622,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
+ // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
+ // get split by legalization.
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
+ CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
+ TLI.isTypeLegal(VT.getScalarType())) {
+ EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
+ if (SDValue ExtCond = combineToExtendBoolVectorInReg(
+ ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
+ ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
+ return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
+ }
+ }
+
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
@@ -44301,14 +45047,15 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
+ bool FoundAndLSB = false;
SDValue Carry = EFLAGS.getOperand(0);
while (Carry.getOpcode() == ISD::TRUNCATE ||
Carry.getOpcode() == ISD::ZERO_EXTEND ||
- Carry.getOpcode() == ISD::SIGN_EXTEND ||
- Carry.getOpcode() == ISD::ANY_EXTEND ||
(Carry.getOpcode() == ISD::AND &&
- isOneConstant(Carry.getOperand(1))))
+ isOneConstant(Carry.getOperand(1)))) {
+ FoundAndLSB |= Carry.getOpcode() == ISD::AND;
Carry = Carry.getOperand(0);
+ }
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
// TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
@@ -44339,6 +45086,14 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
CarryOp1.getOpcode() == X86ISD::ADD &&
isOneConstant(CarryOp1.getOperand(1)))
return CarryOp1;
+ } else if (FoundAndLSB) {
+ SDLoc DL(Carry);
+ SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
+ if (Carry.getOpcode() == ISD::SRL) {
+ BitNo = Carry.getOperand(1);
+ Carry = Carry.getOperand(0);
+ }
+ return getBT(Carry, BitNo, DL, DAG);
}
}
}
@@ -44533,6 +45288,12 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
if (!IsAnyOf && !IsAllOf)
return SDValue();
+  // TODO: Check whether more combining cases should honor this constraint.
+  // We check the number of uses of the compared value to decide whether to
+  // combine. Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
+  // folds below are restricted by this one-use check.
+ bool IsOneUse = CmpOp.getNode()->hasOneUse();
+
// See if we can peek through to a vector with a wider element type, if the
// signbits extend down to all the sub-elements as well.
// Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
@@ -44561,9 +45322,9 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
// MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
// MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
// MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
- if (VecVT.is256BitVector() && NumElts <= CmpBits) {
+ if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
SmallVector<SDValue> Ops;
- if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
+ if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
Ops.size() == 2) {
SDLoc DL(EFLAGS);
EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
@@ -44582,7 +45343,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
// MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
// MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
// MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
- if (IsAllOf && Subtarget.hasSSE41()) {
+ if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
SDValue BC = peekThroughBitcasts(Vec);
// Ensure MOVMSK was testing every signbit of BC.
@@ -44734,7 +45495,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
if (!(FalseOp.getValueType() == MVT::f80 ||
(FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
(FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
- !Subtarget.hasCMov() || hasFPCMov(CC)) {
+ !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
@@ -45181,8 +45942,6 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
- EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
-
// With AVX512 but without BWI, we would need to split v32i16.
if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return SDValue();
@@ -45265,11 +46024,13 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
+ MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
+ DAG.getBitcast(OpVT, Ops[0]),
+ DAG.getBitcast(OpVT, Ops[1]));
};
- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
- { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
PMADDWDBuilder);
}
@@ -45622,12 +46383,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
- else if (SarConst.isNegative())
+ if (SarConst.isNegative())
return DAG.getNode(ISD::SHL, DL, VT, NN,
DAG.getConstant(-SarConst, DL, CVT));
- else
- return DAG.getNode(ISD::SRA, DL, VT, NN,
- DAG.getConstant(SarConst, DL, CVT));
+ return DAG.getNode(ISD::SRA, DL, VT, NN,
+ DAG.getConstant(SarConst, DL, CVT));
}
return SDValue();
}
@@ -46034,11 +46794,9 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
EltBits[0].getZExtValue(), DAG);
}
- APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
- KnownZero, DCI))
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
return SDValue();
@@ -46461,11 +47219,17 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(VT, FPLogic);
}
+ if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
+ !N1.hasOneUse())
+ return SDValue();
+
+ ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
+
// The vector ISA for FP predicates is incomplete before AVX, so converting
// COMIS* to CMPS* may not be a win before AVX.
- // TODO: Check types/predicates to see if they are available with SSE/SSE2.
- if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC ||
- !N0.hasOneUse() || !N1.hasOneUse())
+ if (!Subtarget.hasAVX() &&
+ !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
return SDValue();
// Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
@@ -46482,10 +47246,8 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
- SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01,
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11,
- cast<CondCodeSDNode>(N1.getOperand(2))->get());
+ SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
+ SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
}
@@ -46891,6 +47653,53 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
+ // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
+ // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
+ // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
+ if (VT.isVector() && getTargetConstantFromNode(N1)) {
+ unsigned Opc0 = N0.getOpcode();
+ if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
+ getTargetConstantFromNode(N0.getOperand(1)) &&
+ DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
+ N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
+ SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
+ return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
+ }
+ }
+
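
The per-lane identity behind this fold is straightforward to sanity-check outside the DAG. Below is a minimal standalone sketch (plain C++, hypothetical helper names, not taken from the patch), restricted to the all-ones/all-zero mask lanes that the ComputeNumSignBits check guarantees and to plain ISD::MUL (the same argument applies to MULHU/MULHS):

#include <cassert>
#include <cstdint>

// AND(MUL(x, c1), c2) for one 16-bit lane.
static uint16_t lane_and_mul(uint16_t x, uint16_t c1, uint16_t c2) {
  uint32_t prod = uint32_t(x) * uint32_t(c1); // widen to avoid signed overflow
  return uint16_t(prod & c2);
}

// MUL(x, AND(c1, c2)) for one 16-bit lane.
static uint16_t lane_mul_and(uint16_t x, uint16_t c1, uint16_t c2) {
  return uint16_t(uint32_t(x) * uint32_t(c1 & c2));
}

int main() {
  const uint16_t Masks[] = {0x0000, 0xFFFF}; // all-zero / all-ones lanes only
  for (uint32_t x = 0; x < 0x10000; x += 251)
    for (uint32_t c1 = 0; c1 < 0x10000; c1 += 509)
      for (uint16_t c2 : Masks)
        assert(lane_and_mul(uint16_t(x), uint16_t(c1), c2) ==
               lane_mul_and(uint16_t(x), uint16_t(c1), c2));
  return 0;
}
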
+  // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
+  // This avoids a slow variable shift (moving the shift amount into ECX etc.).
+ if (isOneConstant(N1) && N0->hasOneUse()) {
+ SDValue Src = N0;
+ while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
+ Src.getOpcode() == ISD::TRUNCATE) &&
+ Src.getOperand(0)->hasOneUse())
+ Src = Src.getOperand(0);
+ bool ContainsNOT = false;
+ X86::CondCode X86CC = X86::COND_B;
+ // Peek through AND(NOT(SRL(X,Y)),1).
+ if (isBitwiseNot(Src)) {
+ Src = Src.getOperand(0);
+ X86CC = X86::COND_AE;
+ ContainsNOT = true;
+ }
+ if (Src.getOpcode() == ISD::SRL &&
+ !isa<ConstantSDNode>(Src.getOperand(1))) {
+ SDValue BitNo = Src.getOperand(1);
+ Src = Src.getOperand(0);
+ // Peek through AND(SRL(NOT(X),Y),1).
+ if (isBitwiseNot(Src)) {
+ Src = Src.getOperand(0);
+ X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
+ ContainsNOT = true;
+ }
+ // If we have BMI2 then SHRX should be faster for i32/i64 cases.
+ if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
+ if (SDValue BT = getBT(Src, BitNo, dl, DAG))
+ return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
+ }
+ }
+
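
For reference, the scalar equivalence this bit-extraction fold relies on (shift+mask versus a bit test) can be checked with a short standalone sketch (plain C++, hypothetical function names; bit indices stay below the type width, matching the in-range shift amounts the DAG assumes). Peeking through a NOT, as the code above does, merely flips COND_B to COND_AE and back.

#include <cassert>
#include <cstdint>

// AND(SRL(X, Y), 1): extract bit Y with a variable shift (the shift amount
// would have to be moved into ECX/CL on x86).
static uint32_t bit_via_shift(uint32_t x, uint32_t y) { return (x >> y) & 1u; }

// SETCC(BT(X, Y), COND_B): BT copies bit Y of X into CF, SETB materializes it.
static uint32_t bit_via_bt(uint32_t x, uint32_t y) {
  return (x & (1u << y)) != 0 ? 1u : 0u;
}

int main() {
  const uint32_t Vals[] = {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu};
  for (uint32_t x : Vals)
    for (uint32_t y = 0; y < 32; ++y)
      assert(bit_via_shift(x, y) == bit_via_bt(x, y));
  return 0;
}
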
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
// Attempt to recursively combine a bitmask AND with shuffles.
SDValue Op(N, 0);
@@ -46899,32 +47708,44 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// If either operand is a constant mask, then only the elements that aren't
// zero are actually demanded by the other operand.
- auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
+ auto GetDemandedMasks = [&](SDValue Op) {
APInt UndefElts;
SmallVector<APInt> EltBits;
int NumElts = VT.getVectorNumElements();
int EltSizeInBits = VT.getScalarSizeInBits();
- if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
- return false;
-
- APInt DemandedBits = APInt::getZero(EltSizeInBits);
- APInt DemandedElts = APInt::getZero(NumElts);
- for (int I = 0; I != NumElts; ++I)
- if (!EltBits[I].isZero()) {
- DemandedBits |= EltBits[I];
- DemandedElts.setBit(I);
- }
-
- APInt KnownUndef, KnownZero;
- return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
- KnownZero, DCI) ||
- TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI);
+ APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
+ if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
+ EltBits)) {
+ DemandedBits.clearAllBits();
+ DemandedElts.clearAllBits();
+ for (int I = 0; I != NumElts; ++I)
+ if (!EltBits[I].isZero()) {
+ DemandedBits |= EltBits[I];
+ DemandedElts.setBit(I);
+ }
+ }
+ return std::make_pair(DemandedBits, DemandedElts);
};
- if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
+ std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
+ std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0);
+
+ if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
+ TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
+ TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
+ TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
+
+ SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Demand0.first,
+ Demand0.second, DAG);
+ SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Demand1.first,
+ Demand1.second, DAG);
+ if (NewN0 || NewN1)
+ return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
+ NewN1 ? NewN1 : N1);
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
@@ -47127,8 +47948,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// into:
// srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
-static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
- SelectionDAG &DAG) {
+static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
SDValue Cmp = Op.getOperand(1);
EVT VT = Cmp.getOperand(0).getValueType();
unsigned Log2b = Log2_32(VT.getSizeInBits());
@@ -47139,7 +47959,7 @@ static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
DAG.getConstant(Log2b, dl, MVT::i8));
- return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
+ return Scc;
}
// Try to transform:
@@ -47199,11 +48019,10 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
// or(srl(ctlz),srl(ctlz)).
// The dag combiner can then fold it into:
// srl(or(ctlz, ctlz)).
- EVT VT = OR->getValueType(0);
- SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
+ SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
SDValue Ret, NewRHS;
- if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
- Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
+ if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
+ Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
if (!Ret)
return SDValue();
@@ -47216,21 +48035,18 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
- NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+ NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
if (!NewRHS)
return SDValue();
- Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
+ Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
}
- if (Ret)
- Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
-
- return Ret;
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
}
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
- SDValue And1_L, SDValue And1_R, SDLoc DL,
- SelectionDAG &DAG) {
+ SDValue And1_L, SDValue And1_R,
+ const SDLoc &DL, SelectionDAG &DAG) {
if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
return SDValue();
SDValue NotOp = And0_L->getOperand(0);
@@ -47352,7 +48168,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
N1.getConstantOperandAPInt(1) == HalfElts &&
- DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ DAG.MaskedVectorIsZero(N0, UpperElts)) {
return DAG.getNode(
ISD::CONCAT_VECTORS, dl, VT,
extractSubVector(N0, 0, DAG, dl, HalfElts),
@@ -47360,7 +48176,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
}
if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
N0.getConstantOperandAPInt(1) == HalfElts &&
- DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ DAG.MaskedVectorIsZero(N1, UpperElts)) {
return DAG.getNode(
ISD::CONCAT_VECTORS, dl, VT,
extractSubVector(N1, 0, DAG, dl, HalfElts),
@@ -47389,9 +48205,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!EltBits[I].isAllOnes())
DemandedElts.setBit(I);
- APInt KnownUndef, KnownZero;
- return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
- KnownZero, DCI);
+ return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
};
if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
if (N->getOpcode() != ISD::DELETED_NODE)
@@ -47618,7 +48432,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
// clip to 0-255.
if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
InVT == MVT::v16i32 && VT == MVT::v16i8) {
- if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
// Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
DL, DAG, Subtarget);
@@ -47643,7 +48457,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
- if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
       // Only do this when the result is at least 64 bits or we'll leave
// dangling PACKSSDW nodes.
@@ -47660,7 +48474,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
}
- if (auto SSatVal = detectSSatPattern(In, VT))
+ if (SDValue SSatVal = detectSSatPattern(In, VT))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
Subtarget);
}
@@ -47671,10 +48485,10 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
(SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
unsigned TruncOpc = 0;
SDValue SatVal;
- if (auto SSatVal = detectSSatPattern(In, VT)) {
+ if (SDValue SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
TruncOpc = X86ISD::VTRUNCS;
- } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
+ } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
SatVal = USatVal;
TruncOpc = X86ISD::VTRUNCUS;
}
@@ -47706,7 +48520,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
-/// X86ISD::AVG instruction.
+/// ISD::AVGCEILU (AVG) instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
@@ -47769,7 +48583,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
+ return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
};
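
As a cross-check of the semantics in play here: ISD::AVGCEILU (what PAVGB/PAVGW compute per lane) is exactly the widened c = (a + b + 1) / 2 form that detectAVGPattern matches. A minimal standalone sketch over all i8 inputs (plain C++, hypothetical helper names):

#include <cassert>
#include <cstdint>

// The widened pattern the combine looks for: c = (a + b + 1) / 2.
static uint8_t avg_ceil_widened(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b) + 1) >> 1);
}

// An overflow-free formulation with the same rounding-up behaviour.
static uint8_t avg_ceil_narrow(uint8_t a, uint8_t b) {
  return uint8_t((a | b) - ((a ^ b) >> 1));
}

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      assert(avg_ceil_widened(uint8_t(a), uint8_t(b)) ==
             avg_ceil_narrow(uint8_t(a), uint8_t(b)));
  return 0;
}
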
auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
@@ -47872,7 +48686,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
- Ld->getAlignment() >= 16) ||
+ Ld->getAlign() >= Align(16)) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
@@ -48340,7 +49154,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Split under-aligned vector non-temporal stores.
if (St->isNonTemporal() && StVT == VT &&
- St->getAlignment() < VT.getStoreSize()) {
+ St->getAlign().value() < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -48374,9 +49188,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
// Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
- if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
+ if (!St->isTruncatingStore() &&
(StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
+ StoredVal.hasOneUse() &&
TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
return EmitTruncSStore(IsSigned, St->getChain(),
@@ -48385,15 +49200,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
// Try to fold a extract_element(VTRUNC) pattern into a truncating store.
- if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
+ if (!St->isTruncatingStore()) {
auto IsExtractedElement = [](SDValue V) {
- if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
+ if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
V = V.getOperand(0);
unsigned Opc = V.getOpcode();
- if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
- if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
- return V.getOperand(0);
- }
+ if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
+ isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
+ V.getOperand(0).hasOneUse())
+ return V.getOperand(0);
return SDValue();
};
if (SDValue Extract = IsExtractedElement(StoredVal)) {
@@ -48531,10 +49346,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
- APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
- KnownZero, DCI)) {
+ if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
@@ -49165,7 +49978,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// PACK should still be worth it for 128-bit vectors if the sources were
// originally concatenated from subvectors.
SmallVector<SDValue> ConcatOps;
- if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
+ if (VT.getSizeInBits() > 128 ||
+ !collectConcatOps(In.getNode(), ConcatOps, DAG))
return SDValue();
}
@@ -49478,9 +50292,9 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
SDValue In = N->getOperand(0);
SDLoc DL(N);
- if (auto SSatVal = detectSSatPattern(In, VT))
+ if (SDValue SSatVal = detectSSatPattern(In, VT))
return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
- if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
+ if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -49567,10 +50381,14 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (!UndefElts[I] && !EltBits[I].isSignMask())
return SDValue();
- return peekThroughBitcasts(Op0);
+ // Only allow bitcast from correctly-sized constant.
+ Op0 = peekThroughBitcasts(Op0);
+ if (Op0.getScalarValueSizeInBits() == ScalarSize)
+ return Op0;
}
- }
- }
+ break;
+ } // case
+ } // switch
return SDValue();
}
@@ -50074,10 +50892,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
- KnownZero, DCI))
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
// Convert a full vector load into vzload when not all bits are needed.
@@ -50144,26 +50960,70 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
MVT VT = N->getSimpleValueType(0);
+ // ANDNP(undef, x) -> 0
+ // ANDNP(x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
// ANDNP(0, x) -> x
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
- return N->getOperand(1);
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N1;
// ANDNP(x, 0) -> 0
- if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
- if (SDValue Not = IsNOT(N->getOperand(0), DAG))
- return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
- N->getOperand(1));
+ if (SDValue Not = IsNOT(N0, DAG))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
+
+ // TODO: Constant fold NOT(N0) to allow us to use AND.
+ // TODO: Do this in IsNOT with suitable oneuse checks?
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
+
+ // If either operand is a constant mask, then only the elements that aren't
+ // zero are actually demanded by the other operand.
+ auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
+ APInt UndefElts;
+ SmallVector<APInt> EltBits;
+ int NumElts = VT.getVectorNumElements();
+ int EltSizeInBits = VT.getScalarSizeInBits();
+ APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
+ if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
+ EltBits)) {
+ DemandedBits.clearAllBits();
+ DemandedElts.clearAllBits();
+ for (int I = 0; I != NumElts; ++I)
+ if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
+ DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
+ DemandedElts.setBit(I);
+ }
+ }
+ return std::make_pair(DemandedBits, DemandedElts);
+ };
+ std::pair<APInt, APInt> Demand0 = GetDemandedMasks(N1);
+ std::pair<APInt, APInt> Demand1 = GetDemandedMasks(N0, true);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(N0, Demand0.second, DCI) ||
+ TLI.SimplifyDemandedVectorElts(N1, Demand1.second, DCI) ||
+ TLI.SimplifyDemandedBits(N0, Demand0.first, Demand0.second, DCI) ||
+ TLI.SimplifyDemandedBits(N1, Demand1.first, Demand1.second, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
}
return SDValue();
@@ -50191,11 +51051,9 @@ static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
- APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getLowBitsSet(8, 4);
- if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
- DCI)) {
+ if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
if (N->getOpcode() != ISD::DELETED_NODE)
DCI.AddToWorklist(N);
return SDValue(N, 0);
@@ -50453,110 +51311,6 @@ static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
return Res;
}
-// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
-// This is more or less the reverse of combineBitcastvxi1.
-static SDValue
-combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- unsigned Opcode = N->getOpcode();
- if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
- Opcode != ISD::ANY_EXTEND)
- return SDValue();
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
- EVT SVT = VT.getScalarType();
- EVT InSVT = N0.getValueType().getScalarType();
- unsigned EltSizeInBits = SVT.getSizeInBits();
-
- // Input type must be extending a bool vector (bit-casted from a scalar
- // integer) to legal integer types.
- if (!VT.isVector())
- return SDValue();
- if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
- return SDValue();
- if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
- return SDValue();
-
- SDValue N00 = N0.getOperand(0);
- EVT SclVT = N0.getOperand(0).getValueType();
- if (!SclVT.isScalarInteger())
- return SDValue();
-
- SDLoc DL(N);
- SDValue Vec;
- SmallVector<int, 32> ShuffleMask;
- unsigned NumElts = VT.getVectorNumElements();
- assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
-
- // Broadcast the scalar integer to the vector elements.
- if (NumElts > EltSizeInBits) {
- // If the scalar integer is greater than the vector element size, then we
- // must split it down into sub-sections for broadcasting. For example:
- // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
- // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
- assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
- unsigned Scale = NumElts / EltSizeInBits;
- EVT BroadcastVT =
- EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
- Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
- Vec = DAG.getBitcast(VT, Vec);
-
- for (unsigned i = 0; i != Scale; ++i)
- ShuffleMask.append(EltSizeInBits, i);
- Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
- } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
- (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
- // If we have register broadcast instructions, use the scalar size as the
- // element type for the shuffle. Then cast to the wider element type. The
- // widened bits won't be used, and this might allow the use of a broadcast
- // load.
- assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
- unsigned Scale = EltSizeInBits / NumElts;
- EVT BroadcastVT =
- EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
- Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
- ShuffleMask.append(NumElts * Scale, 0);
- Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
- Vec = DAG.getBitcast(VT, Vec);
- } else {
- // For smaller scalar integers, we can simply any-extend it to the vector
- // element size (we don't care about the upper bits) and broadcast it to all
- // elements.
- SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
- Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
- ShuffleMask.append(NumElts, 0);
- Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
- }
-
- // Now, mask the relevant bit in each element.
- SmallVector<SDValue, 32> Bits;
- for (unsigned i = 0; i != NumElts; ++i) {
- int BitIdx = (i % EltSizeInBits);
- APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
- Bits.push_back(DAG.getConstant(Bit, DL, SVT));
- }
- SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
- Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
-
- // Compare against the bitmask and extend the result.
- EVT CCVT = VT.changeVectorElementType(MVT::i1);
- Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
- Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
-
- // For SEXT, this is now done, otherwise shift the result down for
- // zero-extension.
- if (Opcode == ISD::SIGN_EXTEND)
- return Vec;
- return DAG.getNode(ISD::SRL, DL, VT, Vec,
- DAG.getConstant(EltSizeInBits - 1, DL, VT));
-}
-
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
@@ -50636,7 +51390,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
- if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
+ if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
+ DAG, DCI, Subtarget))
return V;
if (VT.isVector()) {
@@ -50790,7 +51545,8 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
- if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
+ if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
+ DAG, DCI, Subtarget))
return V;
if (VT.isVector())
@@ -50832,7 +51588,7 @@ static bool isOrXorXorTree(SDValue X, bool Root = true) {
/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
/// expansion.
-template<typename F>
+template <typename F>
static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
SDValue Op0 = X.getOperand(0);
@@ -50845,7 +51601,8 @@ static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
if (HasPT)
return DAG.getNode(ISD::OR, DL, VecVT, A, B);
return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
- } else if (X.getOpcode() == ISD::XOR) {
+ }
+ if (X.getOpcode() == ISD::XOR) {
SDValue A = SToV(Op0);
SDValue B = SToV(Op1);
if (VecVT != CmpVT)
@@ -51134,6 +51891,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
LHS.getValueType() == MVT::v4f32)
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+ // X pred 0.0 --> X pred -X
+ // If the negation of X already exists, use it in the comparison. This removes
+ // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
+ // instructions in patterns with a 'select' node.
+ if (isNullFPScalarOrVectorConst(RHS)) {
+ SDVTList FNegVT = DAG.getVTList(OpVT);
+ if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
+ return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
+ }
+
return SDValue();
}
@@ -51145,16 +51912,18 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getSimpleValueType(0);
unsigned NumBits = VT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
+ assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
// Perform constant folding.
- if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
- assert(VT == MVT::i32 && "Unexpected result type");
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
APInt Imm(32, 0);
- for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
- if (!Src.getOperand(Idx).isUndef() &&
- Src.getConstantOperandAPInt(Idx).isNegative())
+ for (unsigned Idx = 0; Idx != NumElts; ++Idx)
+ if (!UndefElts[Idx] && EltBits[Idx].isNegative())
Imm.setBit(Idx);
- }
+
return DAG.getConstant(Imm, SDLoc(N), VT);
}
@@ -51713,8 +52482,6 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
CC = (X86::CondCode)User->getConstantOperandVal(0);
break;
case X86ISD::BRCOND:
- CC = (X86::CondCode)User->getConstantOperandVal(2);
- break;
case X86ISD::CMOV:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
@@ -51743,10 +52510,14 @@ static bool onlyZeroFlagUsed(SDValue Flags) {
default:
// Be conservative.
return false;
- case X86ISD::SETCC: CCOpNo = 0; break;
- case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
- case X86ISD::BRCOND: CCOpNo = 2; break;
- case X86ISD::CMOV: CCOpNo = 2; break;
+ case X86ISD::SETCC:
+ case X86ISD::SETCC_CARRY:
+ CCOpNo = 0;
+ break;
+ case X86ISD::BRCOND:
+ case X86ISD::CMOV:
+ CCOpNo = 2;
+ break;
}
X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
@@ -51757,6 +52528,215 @@ static bool onlyZeroFlagUsed(SDValue Flags) {
return true;
}
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
+ SDValue X, SDValue Y,
+ SelectionDAG &DAG,
+ bool ZeroSecondOpOnly = false) {
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ // Look through a one-use zext.
+ if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
+ Y = Y.getOperand(0);
+
+ X86::CondCode CC;
+ SDValue EFLAGS;
+ if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
+ CC = (X86::CondCode)Y.getConstantOperandVal(0);
+ EFLAGS = Y.getOperand(1);
+ } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
+ Y.hasOneUse()) {
+ EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
+ }
+
+ if (!EFLAGS)
+ return SDValue();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ auto *ConstantX = dyn_cast<ConstantSDNode>(X);
+ if (ConstantX && !ZeroSecondOpOnly) {
+ if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
+ (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
+ // This is a complicated way to get -1 or 0 from the carry flag:
+ // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ EFLAGS);
+ }
+
+ if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
+ (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Swap the operands of a SUB, and we have the same pattern as above.
+ // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
+ // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ NewEFLAGS);
+ }
+ }
+ }
+
+ if (CC == X86::COND_B) {
+ // X + SETB Z --> adc X, 0
+ // X - SETB Z --> sbb X, 0
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(0, DL, VT), EFLAGS);
+ }
+
+ if (ZeroSecondOpOnly)
+ return SDValue();
+
+ if (CC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub =
+ DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(0, DL, VT), NewEFLAGS);
+ }
+ }
+
+ if (CC == X86::COND_AE) {
+ // X + SETAE --> sbb X, -1
+ // X - SETAE --> adc X, -1
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), EFLAGS);
+ }
+
+ if (CC == X86::COND_BE) {
+ // X + SETBE --> sbb X, -1
+ // X - SETBE --> adc X, -1
+ // Try to convert COND_BE into COND_AE in an attempt to facilitate
+ // materializing "setae reg".
+ //
+ // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub =
+ DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), NewEFLAGS);
+ }
+ }
+
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
+ !X86::isZeroNode(EFLAGS.getOperand(1)) ||
+ !EFLAGS.getOperand(0).getValueType().isInteger())
+ return SDValue();
+
+ SDValue Z = EFLAGS.getOperand(0);
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ if (ConstantX) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Cmp1.getValue(1));
+ }
+ }
+
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+ // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
+ if (CC == X86::COND_NE)
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+ DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
+
+ // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
+ // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+ DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
+}
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+ bool IsSub = N->getOpcode() == ISD::SUB;
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
+ return ADCOrSBB;
+
+ // Commute and try again (negate the result for subtracts).
+ if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
+ if (IsSub)
+ ADCOrSBB =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
+ return ADCOrSBB;
+ }
+
+ return SDValue();
+}
+
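
The carry-flag identities the helper above relies on (folding X +/- SETcc into ADC/SBB against "cmp Z, 1") can be sanity-checked with a small standalone model (plain C++, hypothetical names; CF is modeled as the unsigned borrow of Z - 1, which is set exactly when Z == 0):

#include <cassert>
#include <cstdint>

// Carry flag after "cmp Z, 1" (i.e. Z - 1): the subtract borrows iff Z == 0.
static uint32_t cf_from_cmp_z_1(uint32_t z) { return z == 0 ? 1u : 0u; }

// adc X, Imm computes X + Imm + CF; sbb X, Imm computes X - Imm - CF.
static uint32_t adc(uint32_t x, uint32_t imm, uint32_t cf) { return x + imm + cf; }
static uint32_t sbb(uint32_t x, uint32_t imm, uint32_t cf) { return x - imm - cf; }

int main() {
  const uint32_t Zs[] = {0u, 1u, 2u, 0x80000000u, 0xFFFFFFFFu};
  const uint32_t Xs[] = {0u, 1u, 42u, 0xFFFFFFFFu};
  for (uint32_t z : Zs)
    for (uint32_t x : Xs) {
      uint32_t cf = cf_from_cmp_z_1(z);
      // X - (Z != 0) --> adc X, -1, (cmp Z, 1)
      assert(x - (z != 0) == adc(x, 0xFFFFFFFFu, cf));
      // X + (Z != 0) --> sbb X, -1, (cmp Z, 1)
      assert(x + (z != 0) == sbb(x, 0xFFFFFFFFu, cf));
      // X - (Z == 0) --> sbb X, 0, (cmp Z, 1)
      assert(x - (z == 0) == sbb(x, 0u, cf));
      // X + (Z == 0) --> adc X, 0, (cmp Z, 1)
      assert(x + (z == 0) == adc(x, 0u, cf));
    }
  return 0;
}
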
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
// Only handle test patterns.
if (!isNullConstant(N->getOperand(1)))
@@ -51792,6 +52772,16 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
}
}
+ // Peek through any zero-extend if we're only testing for a zero result.
+ if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() >= 8 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
+ DAG.getConstant(0, dl, SrcVT));
+ }
+
// Look for a truncate.
if (Op.getOpcode() != ISD::TRUNCATE)
return SDValue();
@@ -51867,7 +52857,8 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
MVT VT = LHS.getSimpleValueType();
- unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
+ bool IsSub = X86ISD::SUB == N->getOpcode();
+ unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
// If we don't use the flag result, simplify back to a generic ADD/SUB.
if (!N->hasAnyUseOfValue(1)) {
@@ -51889,26 +52880,29 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
MatchGeneric(LHS, RHS, false);
MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
- return SDValue();
+ // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
+ // EFLAGS result doesn't change.
+ return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
+ /*ZeroSecondOpOnly*/ true);
}
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue BorrowIn = N->getOperand(2);
+
+ if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
- return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
- N->getOperand(0), N->getOperand(1),
- Flags);
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
}
// Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
// iff the flag result is dead.
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
+ if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
!N->hasAnyUseOfValue(1))
- return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
- Op0.getOperand(1), N->getOperand(2));
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
+ LHS.getOperand(1), BorrowIn);
return SDValue();
}
@@ -51916,228 +52910,60 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+ auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
+ auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+
+ // Canonicalize constant to RHS.
+ if (LHSC && !RHSC)
+ return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
+ CarryIn);
+
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
- if (X86::isZeroNode(N->getOperand(0)) &&
- X86::isZeroNode(N->getOperand(1)) &&
+ if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// dead right now.
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
- SDValue Res1 =
- DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
- N->getOperand(2)),
- DAG.getConstant(1, DL, VT));
+ SDValue Res1 = DAG.getNode(
+ ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
+ DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
- if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
- MVT VT = N->getSimpleValueType(0);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
- return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
- N->getOperand(0), N->getOperand(1),
- Flags);
- }
-
- return SDValue();
-}
-
-/// If this is an add or subtract where one operand is produced by a cmp+setcc,
-/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
-/// with CMP+{ADC, SBB}.
-static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
- bool IsSub = N->getOpcode() == ISD::SUB;
- SDValue X = N->getOperand(0);
- SDValue Y = N->getOperand(1);
-
- // If this is an add, canonicalize a zext operand to the RHS.
- // TODO: Incomplete? What if both sides are zexts?
- if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
- Y.getOpcode() != ISD::ZERO_EXTEND)
- std::swap(X, Y);
-
- // Look through a one-use zext.
- bool PeekedThroughZext = false;
- if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
- Y = Y.getOperand(0);
- PeekedThroughZext = true;
- }
-
- // If this is an add, canonicalize a setcc operand to the RHS.
- // TODO: Incomplete? What if both sides are setcc?
- // TODO: Should we allow peeking through a zext of the other operand?
- if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
- Y.getOpcode() != X86ISD::SETCC)
- std::swap(X, Y);
-
- if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
- return SDValue();
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
- X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
-
- // If X is -1 or 0, then we have an opportunity to avoid constants required in
- // the general case below.
- auto *ConstantX = dyn_cast<ConstantSDNode>(X);
- if (ConstantX) {
- if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
- (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
- // This is a complicated way to get -1 or 0 from the carry flag:
- // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
- // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
- return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
- Y.getOperand(1));
- }
-
- if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
- (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
- SDValue EFLAGS = Y->getOperand(1);
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- // Swap the operands of a SUB, and we have the same pattern as above.
- // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
- // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
- SDValue NewSub = DAG.getNode(
- X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
- NewEFLAGS);
- }
- }
- }
-
- if (CC == X86::COND_B) {
- // X + SETB Z --> adc X, 0
- // X - SETB Z --> sbb X, 0
- return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
- DAG.getVTList(VT, MVT::i32), X,
- DAG.getConstant(0, DL, VT), Y.getOperand(1));
- }
-
- if (CC == X86::COND_A) {
- SDValue EFLAGS = Y.getOperand(1);
- // Try to convert COND_A into COND_B in an attempt to facilitate
- // materializing "setb reg".
- //
- // Do not flip "e > c", where "c" is a constant, because Cmp instruction
- // cannot take an immediate as its first operand.
- //
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
- EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
- return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
- DAG.getVTList(VT, MVT::i32), X,
- DAG.getConstant(0, DL, VT), NewEFLAGS);
- }
- }
-
- if (CC == X86::COND_AE) {
- // X + SETAE --> sbb X, -1
- // X - SETAE --> adc X, -1
- return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
- DAG.getVTList(VT, MVT::i32), X,
- DAG.getConstant(-1, DL, VT), Y.getOperand(1));
- }
-
- if (CC == X86::COND_BE) {
- // X + SETBE --> sbb X, -1
- // X - SETBE --> adc X, -1
- SDValue EFLAGS = Y.getOperand(1);
- // Try to convert COND_BE into COND_AE in an attempt to facilitate
- // materializing "setae reg".
- //
- // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
- // cannot take an immediate as its first operand.
- //
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(
- X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
- return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
- DAG.getVTList(VT, MVT::i32), X,
- DAG.getConstant(-1, DL, VT), NewEFLAGS);
- }
+ // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
+ // iff the flag result is dead.
+  // TODO: Allow a flag result if C1+C2 doesn't overflow (signed or unsigned).
+ if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
+ SDLoc DL(N);
+ APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
+ return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
+ DAG.getConstant(0, DL, LHS.getValueType()),
+ DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
}
- if (CC != X86::COND_E && CC != X86::COND_NE)
- return SDValue();
-
- SDValue Cmp = Y.getOperand(1);
- if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
- !X86::isZeroNode(Cmp.getOperand(1)) ||
- !Cmp.getOperand(0).getValueType().isInteger())
- return SDValue();
-
- SDValue Z = Cmp.getOperand(0);
- EVT ZVT = Z.getValueType();
-
- // If X is -1 or 0, then we have an opportunity to avoid constants required in
- // the general case below.
- if (ConstantX) {
- // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
- // fake operands:
- // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
- // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
- if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
- (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
- SDValue Zero = DAG.getConstant(0, DL, ZVT);
- SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
- return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
- SDValue(Neg.getNode(), 1));
- }
-
- // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
- // with fake operands:
- // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
- // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
- if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
- (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
- SDValue One = DAG.getConstant(1, DL, ZVT);
- SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
- SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
- return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
- Cmp1.getValue(1));
- }
+ if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
}
- // (cmp Z, 1) sets the carry flag if Z is 0.
- SDValue One = DAG.getConstant(1, DL, ZVT);
- SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
- SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
-
- // Add the flags type for ADC/SBB nodes.
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
- // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
- // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
- if (CC == X86::COND_NE)
- return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
+ // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
+ // iff the flag result is dead.
+ if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
+ !N->hasAnyUseOfValue(1))
+ return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
+ LHS.getOperand(1), CarryIn);
- // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
- // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
- return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
+ return SDValue();
}
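
A minimal standalone C++ sketch, not taken from the patch, that checks the value-level identities the two ADC folds above rely on. It assumes plain uint32_t wraparound arithmetic models the i32 value result and ignores EFLAGS entirely, matching the "flag result is dead" precondition; the helper name adc32 is invented here.

#include <cassert>
#include <cstdint>

// Value result of ADC only; the EFLAGS output is deliberately not modelled.
static uint32_t adc32(uint32_t A, uint32_t B, uint32_t CarryIn) {
  return A + B + (CarryIn & 1);
}

int main() {
  const uint32_t C1 = 0x12345678u, C2 = 0xdeadbeefu;
  const uint32_t X = 0x0f0f0f0fu, Y = 0xcafebabeu;
  for (uint32_t Carry = 0; Carry <= 1; ++Carry) {
    // ADC(C1,C2,Carry) == ADC(0,C1+C2,Carry); C1+C2 wraps just like APInt.
    assert(adc32(C1, C2, Carry) == adc32(0, C1 + C2, Carry));
    // ADC(ADD(X,Y),0,Carry) == ADC(X,Y,Carry).
    assert(adc32(X + Y, 0, Carry) == adc32(X, Y, Carry));
  }
  return 0;
}
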
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
@@ -52432,7 +53258,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
/// Try to fold those constants into an 'add' instruction to reduce instruction
/// count. We do this with CMOV rather the generic 'select' because there are
/// earlier folds that may be used to turn select-of-constants into logic hacks.
-static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
+static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// If an operand is zero, add-of-0 gets simplified away, so that's clearly
// better because we eliminate 1-2 instructions. This transform is still
// an improvement without zero operands because we trade 2 move constants and
@@ -52457,6 +53284,11 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
if (!isSuitableCmov(Cmov))
return SDValue();
+ // Don't remove a load folding opportunity for the add. That would neutralize
+ // any improvements from removing constant materializations.
+ if (X86::mayFoldLoad(OtherOp, Subtarget))
+ return SDValue();
+
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue FalseOp = Cmov.getOperand(0);
@@ -52499,7 +53331,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
- if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
+ if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
return Select;
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
@@ -52535,6 +53367,14 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
}
}
+ // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
+ if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
+ X86::isZeroNode(Op0.getOperand(1))) {
+ assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
+ return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
+ Op0.getOperand(0), Op0.getOperand(2));
+ }
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -52617,6 +53457,25 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
return V;
+ // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
+ if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
+ X86::isZeroNode(Op1.getOperand(1))) {
+ assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
+ return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
+ Op1.getOperand(0), Op1.getOperand(2));
+ }
+
+ // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
+ // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
+ if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
+ !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
+ assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
+ SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
+ Op1.getOperand(1), Op1.getOperand(2));
+ return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
+ Op1.getOperand(0));
+ }
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
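
As with the sketch above, a standalone check, not part of the patch, of the wraparound identities behind the carry folds added in combineAdd and combineSub (ADD(ADC(Y,0,W),X), SUB(X,ADC(Y,0,W)) and SUB(X,SBB(Y,Z,W))); uint32_t arithmetic stands in for the node value results, and the flag outputs are assumed dead, as the asserts in the code require.

#include <cassert>
#include <cstdint>

static uint32_t adc32(uint32_t A, uint32_t B, uint32_t W) { return A + B + W; }
static uint32_t sbb32(uint32_t A, uint32_t B, uint32_t W) { return A - B - W; }

int main() {
  const uint32_t X = 0x89abcdefu, Y = 0x01234567u, Z = 0xfeedfaceu;
  for (uint32_t W = 0; W <= 1; ++W) {
    assert(X + adc32(Y, 0, W) == adc32(X, Y, W));     // ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
    assert(X - adc32(Y, 0, W) == sbb32(X, Y, W));     // SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
    assert(X - sbb32(Y, Z, W) == adc32(X, Z, W) - Y); // SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
  }
  return 0;
}
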
@@ -52745,6 +53604,17 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
Subs.push_back(SubOp.getOperand(I));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
};
+ auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
+ for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
+ SDValue Sub = SubOps[I].getOperand(Op);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Sub.getOperand(0).getValueType() != VT ||
+ Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
+ return false;
+ }
+ return true;
+ };
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
@@ -52802,6 +53672,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
DAG.getTargetConstant(Idx, DL, MVT::i8));
}
break;
+ case X86ISD::PSHUFB:
+ if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ ConcatSubOperand(VT, Ops, 0),
+ ConcatSubOperand(VT, Ops, 1));
+ }
+ break;
case X86ISD::VPERMV3:
if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
MVT OpVT = Op0.getSimpleValueType();
@@ -52920,6 +53798,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
}
break;
+ case ISD::VSELECT:
+ case X86ISD::BLENDV:
+ if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasInt256()) &&
+ IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
+ EVT SelVT = Ops[0].getOperand(0).getValueType();
+ SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
+ ConcatSubOperand(VT, Ops, 1),
+ ConcatSubOperand(VT, Ops, 2));
+ }
+ break;
}
}
@@ -52937,12 +53828,29 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
+ // Attempt to fold target constant loads.
+ if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
+ SmallVector<APInt> EltBits;
+ APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
+ for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
+ APInt OpUndefElts;
+ SmallVector<APInt> OpEltBits;
+ if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
+ OpEltBits, true, false))
+ break;
+ EltBits.append(OpEltBits);
+ UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
+ }
+ if (EltBits.size() == VT.getVectorNumElements())
+ return getConstVector(EltBits, UndefElts, VT, DAG, DL);
+ }
+
return SDValue();
}
-static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -52961,9 +53869,9 @@ static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -53044,7 +53952,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
- if (collectConcatOps(N, SubVectorOps)) {
+ if (collectConcatOps(N, SubVectorOps, DAG)) {
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
@@ -53103,10 +54011,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
/// This function should only be called with legal types (otherwise, the calls
/// to get simple value types will assert).
static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
- SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
+ SDValue Sel = Ext->getOperand(0);
SmallVector<SDValue, 4> CatOps;
if (Sel.getOpcode() != ISD::VSELECT ||
- !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+ !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
return SDValue();
// Note: We assume simple value types because this should only be called with
@@ -53154,9 +54062,9 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
return DAG.getBitcast(VT, NarrowSel);
}
-static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
// eventually get combined/lowered into ANDNP) with a concatenated operand,
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
@@ -53177,6 +54085,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
EVT InVecVT = InVec.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned InSizeInBits = InVecVT.getSizeInBits();
+ unsigned NumSubElts = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -53214,22 +54123,24 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getBuildVector(
- VT, SDLoc(N),
- InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
+ return DAG.getBuildVector(VT, SDLoc(N),
+ InVec->ops().slice(IdxVal, NumSubElts));
- // If we are extracting from an insert into a zero vector, replace with a
- // smaller insert into zero if we don't access less than the original
- // subvector. Don't do this for i1 vectors.
+ // If we are extracting from an insert into a larger vector, replace with a
+ // smaller insert if we don't access less than the original subvector. Don't
+ // do this for i1 vectors.
+ // TODO: Relax the matching indices requirement?
if (VT.getVectorElementType() != MVT::i1 &&
- InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
- InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
- ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
+ InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
+ IdxVal == InVec.getConstantOperandVal(2) &&
InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
SDLoc DL(N);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- getZeroVector(VT, Subtarget, DAG, DL),
- InVec.getOperand(1), InVec.getOperand(2));
+ SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
+ InVec.getOperand(0), N->getOperand(1));
+ unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
+ InVec.getOperand(1),
+ DAG.getVectorIdxConstant(NewIdxVal, DL));
}
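
A small array-based model, not from the patch, of the generalized extract-of-insert rewrite above, under the matching-indices requirement the code enforces (the extract index equals the insert index, so the re-insert lands at offset 0); all names are invented for illustration.

#include <array>
#include <cassert>
#include <cstddef>

int main() {
  const std::array<int, 8> Base{0, 1, 2, 3, 4, 5, 6, 7};
  const std::array<int, 2> Sub{100, 101};
  const size_t InsIdx = 4, ExtIdx = 4, NumSubElts = 4; // ExtIdx == InsIdx

  // Before: extract NumSubElts at ExtIdx from insert_subvector(Base, Sub, InsIdx).
  std::array<int, 8> Ins = Base;
  for (size_t I = 0; I < Sub.size(); ++I)
    Ins[InsIdx + I] = Sub[I];
  std::array<int, 4> Old{}, New{};
  for (size_t I = 0; I < NumSubElts; ++I)
    Old[I] = Ins[ExtIdx + I];

  // After: extract NumSubElts at ExtIdx from Base, then re-insert Sub at
  // InsIdx - ExtIdx (which is 0 here).
  for (size_t I = 0; I < NumSubElts; ++I)
    New[I] = Base[ExtIdx + I];
  for (size_t I = 0; I < Sub.size(); ++I)
    New[(InsIdx - ExtIdx) + I] = Sub[I];

  assert(Old == New);
  return 0;
}
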
// If we're extracting an upper subvector from a broadcast we should just
@@ -53246,8 +54157,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
// Attempt to extract from the source of a shuffle vector.
- if ((InSizeInBits % SizeInBits) == 0 &&
- (IdxVal % VT.getVectorNumElements()) == 0) {
+ if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
SmallVector<int, 32> ShuffleMask;
SmallVector<int, 32> ScaledMask;
SmallVector<SDValue, 2> ShuffleInputs;
@@ -53255,7 +54165,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// Decode the shuffle mask and scale it so it's shuffling subvectors.
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
- unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
+ unsigned SubVecIdx = IdxVal / NumSubElts;
if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (ScaledMask[SubVecIdx] == SM_SentinelZero)
@@ -53263,7 +54173,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
if (Src.getValueSizeInBits() == InSizeInBits) {
unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
- unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
+ unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
SDLoc(N), SizeInBits);
}
@@ -53273,8 +54183,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
unsigned InOpcode = InVec.getOpcode();
- if (IdxVal == 0 && InVec.hasOneUse()) {
- if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
+ if (InVec.hasOneUse()) {
+ if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
@@ -53291,7 +54201,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
- if ((InOpcode == ISD::ANY_EXTEND ||
+ if (IdxVal == 0 &&
+ (InOpcode == ISD::ANY_EXTEND ||
InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
InOpcode == ISD::ZERO_EXTEND ||
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
@@ -53306,7 +54217,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
return DAG.getNode(ExtOp, DL, VT, Ext);
}
- if (InOpcode == ISD::VSELECT &&
+ if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
InVec.getOperand(1).getValueType().is256BitVector() &&
InVec.getOperand(2).getValueType().is256BitVector()) {
@@ -53316,7 +54227,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
- if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
+ if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
(VT.is128BitVector() || VT.is256BitVector())) {
SDLoc DL(N);
SDValue InVecSrc = InVec.getOperand(0);
@@ -53324,6 +54235,13 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
return DAG.getNode(InOpcode, DL, VT, Ext);
}
+ if (InOpcode == X86ISD::MOVDDUP &&
+ (VT.is128BitVector() || VT.is256BitVector())) {
+ SDLoc DL(N);
+ SDValue Ext0 =
+ extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext0);
+ }
}
// Always split vXi64 logical shifts where we're extracting the upper 32-bits
@@ -53476,11 +54394,9 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
ISD::isBuildVectorAllZeros(RHS.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
- APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
- KnownZero, DCI))
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
return SDValue();
@@ -53494,6 +54410,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = N->getOpcode();
unsigned InOpcode = In.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc DL(N);
// Try to merge vector loads and extend_inreg to an extload.
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
@@ -53506,10 +54423,9 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
: ISD::ZEXTLOAD;
EVT MemVT = VT.changeVectorElementType(SVT);
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
- SDValue Load =
- DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
- Ld->getMemOperand()->getFlags());
+ SDValue Load = DAG.getExtLoad(
+ Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
+ MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
}
@@ -53518,7 +54434,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
// Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
if (Opcode == InOpcode)
- return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
+ return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
// Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
// -> EXTEND_VECTOR_INREG(X).
@@ -53527,12 +54443,26 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
In.getOperand(0).getOperand(0).getValueSizeInBits() ==
In.getValueSizeInBits())
- return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
+ return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
- // Attempt to combine as a shuffle.
- // TODO: General ZERO_EXTEND_VECTOR_INREG support.
- if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
- (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
+ // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
+ // TODO: Move to DAGCombine?
+ if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
+ In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
+ In.getValueSizeInBits() == VT.getSizeInBits()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
+ EVT EltVT = In.getOperand(0).getValueType();
+ SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
+ for (unsigned I = 0; I != NumElts; ++I)
+ Elts[I * Scale] = In.getOperand(I);
+ return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
+ }
+
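
A little-endian model, not from the patch, of why the zero-interleaved BUILD_VECTOR above is equivalent to the in-register zero extension: each original element sits in the low half of a wider lane with zeros above it, so reinterpreting the bytes yields the zero-extended values. This relies on the little-endian layout x86 uses; the host running the sketch is assumed little-endian too.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint16_t X = 0xbeef, Y = 0x1234;
  const uint16_t Interleaved[4] = {X, 0, Y, 0}; // Elts[I * Scale] = In element I
  uint32_t Wide[2];
  std::memcpy(Wide, Interleaved, sizeof(Wide)); // stands in for the bitcast
  assert(Wide[0] == X && Wide[1] == Y);         // per-lane zext i16 -> i32
  return 0;
}
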
+ // Attempt to combine as a shuffle on SSE41+ targets.
+ if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+ Subtarget.hasSSE41()) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
@@ -53549,11 +54479,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
- APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
- KnownZero, DCI))
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
return SDValue();
@@ -53781,11 +54709,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::CONCAT_VECTORS:
- return combineConcatVectors(N, DAG, DCI, Subtarget);
+ return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
- return combineInsertSubvector(N, DAG, DCI, Subtarget);
+ return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
- return combineExtractSubvector(N, DAG, DCI, Subtarget);
+ return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
@@ -54397,37 +55325,37 @@ TargetLowering::ConstraintWeight
weight = CW_Register;
break;
case 'I':
- if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
}
break;
case 'J':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
}
break;
case 'K':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
}
break;
case 'L':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
}
break;
case 'M':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
}
break;
case 'N':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
}
@@ -54439,14 +55367,14 @@ TargetLowering::ConstraintWeight
}
break;
case 'e':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
}
break;
case 'Z':
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
}
@@ -54511,7 +55439,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
switch (ConstraintLetter) {
default: break;
case 'I':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
@@ -54520,7 +55448,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
return;
case 'J':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
@@ -54529,7 +55457,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
return;
case 'K':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
@@ -54538,7 +55466,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
return;
case 'L':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
@@ -54548,7 +55476,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
return;
case 'M':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
@@ -54557,7 +55485,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
return;
case 'N':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
@@ -54566,7 +55494,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
return;
case 'O':
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
@@ -54576,7 +55504,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'e': {
// 32-bit signed value
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
@@ -54590,7 +55518,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
case 'Z': {
// 32-bit unsigned value
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
@@ -54604,7 +55532,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
}
case 'i': {
// Literal immediates are always ok.
- if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
@@ -54617,8 +55545,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
- // be used as immediates.
- if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
+ // be used as immediates. BlockAddresses are fine though.
+ if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
+ !isa<BlockAddressSDNode>(Op))
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3f6d567d3f4d..af110884049b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -249,9 +249,6 @@ namespace llvm {
SCALEFS,
SCALEFS_RND,
- // Unsigned Integer average.
- AVG,
-
/// Integer horizontal add/sub.
HADD,
HSUB,
@@ -790,6 +787,9 @@ namespace llvm {
LOR,
LXOR,
LAND,
+ LBTS,
+ LBTC,
+ LBTR,
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
@@ -1039,10 +1039,7 @@ namespace llvm {
bool isCtlzFast() const override;
- bool hasBitPreservingFPLogic(EVT VT) const override {
- return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
- (VT == MVT::f16 && X86ScalarSSEf16);
- }
+ bool hasBitPreservingFPLogic(EVT VT) const override;
bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
// If the pair to store is a mixture of float and int values, we will
@@ -1163,6 +1160,19 @@ namespace llvm {
APInt &UndefElts,
unsigned Depth) const override;
+ bool isTargetCanonicalConstantNode(SDValue Op) const override {
+ // Peek through bitcasts/extracts/inserts to see if we have a broadcast
+ // vector from memory.
+ while (Op.getOpcode() == ISD::BITCAST ||
+ Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+ (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Op.getOperand(0).isUndef()))
+ Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
+
+ return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ TargetLowering::isTargetCanonicalConstantNode(Op);
+ }
+
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
@@ -1288,6 +1298,9 @@ namespace llvm {
/// from i32 to i8 but not from i32 to i16.
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+ bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+ EVT VT) const override;
+
/// Given an intrinsic, checks if on the target the intrinsic will need to map
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and stores the intrinsic information into the IntrinsicInfo that was
@@ -1316,15 +1329,13 @@ namespace llvm {
/// Returns true if lowering to a jump table is allowed.
bool areJTsAllowed(const Function *Fn) const override;
+ MVT getPreferredSwitchConditionType(LLVMContext &Context,
+ EVT ConditionVT) const override;
+
/// If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
/// in order to save space and / or reduce runtime.
- bool ShouldShrinkFPConstant(EVT VT) const override {
- // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
- // expensive than a straight movsd. On the other hand, it's important to
- // shrink long double fp constant since fldt is very slow.
- return !X86ScalarSSEf64 || VT == MVT::f80;
- }
+ bool ShouldShrinkFPConstant(EVT VT) const override;
/// Return true if we believe it is correct and profitable to reduce the
/// load node to a smaller type.
@@ -1333,11 +1344,7 @@ namespace llvm {
/// Return true if the specified scalar FP type is computed in an SSE
/// register, not on the X87 floating point stack.
- bool isScalarFPTypeInSSEReg(EVT VT) const {
- return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
- (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
- (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
- }
+ bool isScalarFPTypeInSSEReg(EVT VT) const;
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
@@ -1491,13 +1498,6 @@ namespace llvm {
/// make the right decision when generating code for different targets.
const X86Subtarget &Subtarget;
- /// Select between SSE or x87 floating point ops.
- /// When SSE is available, use it for f32 operations.
- /// When SSE2 is available, use it for f64 operations.
- bool X86ScalarSSEf32;
- bool X86ScalarSSEf64;
- bool X86ScalarSSEf16;
-
/// A list of legal FP immediates.
std::vector<APFloat> LegalFPImmediates;
@@ -1637,9 +1637,13 @@ namespace llvm {
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
- bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+ void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
@@ -1649,6 +1653,8 @@ namespace llvm {
bool needsCmpXchgNb(Type *MemType) const;
+ template<typename T> bool isSoftFP16(T VT) const;
+
void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;
diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp
index e08b4b7c03c6..001aa2dcb879 100644
--- a/llvm/lib/Target/X86/X86IndirectThunks.cpp
+++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -31,6 +31,7 @@
#include "X86Subtarget.h"
#include "llvm/CodeGen/IndirectThunks.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
index 004e6fa5ebf4..08dc514a6476 100644
--- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
@@ -23,6 +23,7 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/ProfileData/SampleProf.h"
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index ff8710634e89..c098122685be 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -354,10 +354,9 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
// If the shift amount is guaranteed to be in-range we can replace it with a
// generic shift.
- APInt UpperBits =
- APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
- if (llvm::MaskedValueIsZero(Amt, UpperBits,
- II.getModule()->getDataLayout())) {
+ KnownBits KnownAmt =
+ llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
+ if (KnownAmt.getMaxValue().ult(BitWidth)) {
return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
: Builder.CreateLShr(Vec, Amt))
: Builder.CreateAShr(Vec, Amt));
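
A standalone sketch, not from the patch, of why the known-bits bound in simplifyX86varShift makes the rewrite safe: the AVX2 per-element shifts produce 0 (or a sign fill, for the arithmetic variant) when a lane's count is at least the element width, while a generic IR shift is only well defined for counts below the width, so the two agree exactly when every count is provably in range. The helper below models a single vpsllvd lane.

#include <cassert>
#include <cstdint>

// x86 VPSLLVD lane semantics: out-of-range counts give 0 instead of UB.
static uint32_t psllv_lane(uint32_t V, uint32_t Amt) {
  return Amt >= 32 ? 0u : V << Amt;
}

int main() {
  const uint32_t V = 0x80000001u;
  for (uint32_t Amt = 0; Amt < 32; ++Amt)     // the "known max < BitWidth" case
    assert(psllv_lane(V, Amt) == (V << Amt)); // matches a plain shift
  // psllv_lane(V, 32) == 0, but "V << 32" is undefined in C++ (poison in IR),
  // which is why the combine needs the bound before emitting a generic shift.
  return 0;
}
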
@@ -521,11 +520,10 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II,
// %int = bitcast <16 x i1> %cmp to i16
// %res = zext i16 %int to i32
unsigned NumElts = ArgTy->getNumElements();
- Type *IntegerVecTy = VectorType::getInteger(ArgTy);
Type *IntegerTy = Builder.getIntNTy(NumElts);
- Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
- Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
+ Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
+ Res = Builder.CreateIsNeg(Res);
Res = Builder.CreateBitCast(Res, IntegerTy);
Res = Builder.CreateZExtOrTrunc(Res, ResTy);
return Res;
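
A standalone model, not from the patch, of the movmsk semantics that simplifyX86movmsk relies on: the result is the per-lane sign bits packed into an integer, which is exactly what the is-negative compare followed by the <N x i1> bitcast produces. The 4 x i32 case (movmskps-style) is shown for concreteness.

#include <cassert>
#include <cstdint>

static uint32_t movmsk4x32(const int32_t Lanes[4]) {
  uint32_t Mask = 0;
  for (int I = 0; I < 4; ++I)
    Mask |= (Lanes[I] < 0 ? 1u : 0u) << I; // sign bit of lane I -> result bit I
  return Mask;
}

int main() {
  const int32_t Lanes[4] = {-1, 7, INT32_MIN, 0};
  assert(movmsk4x32(Lanes) == 0x5u); // lanes 0 and 2 are negative
  return 0;
}
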
@@ -997,20 +995,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
- if (MaskC->getValue().isShiftedMask()) {
+ unsigned MaskIdx, MaskLen;
+ if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
// Any single contiguous sequence of 1s anywhere in the mask simply
// describes a subset of the input bits shifted to the appropriate
// position. Replace with the straightforward IR.
- unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
Value *Input = II.getArgOperand(0);
Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
- Value *Shifted = IC.Builder.CreateLShr(Masked,
- ConstantInt::get(II.getType(),
- ShiftAmount));
+ Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
+ Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
return IC.replaceInstUsesWith(II, Shifted);
}
-
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();
@@ -1042,15 +1038,15 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
- if (MaskC->getValue().isShiftedMask()) {
+
+ unsigned MaskIdx, MaskLen;
+ if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
// Any single contiguous sequence of 1s anywhere in the mask simply
// describes a subset of the input bits shifted to the appropriate
// position. Replace with the straightforward IR.
- unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
Value *Input = II.getArgOperand(0);
- Value *Shifted = IC.Builder.CreateShl(Input,
- ConstantInt::get(II.getType(),
- ShiftAmount));
+ Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
+ Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
return IC.replaceInstUsesWith(II, Masked);
}
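
These two hunks look like the BMI2 PEXT and PDEP combines (extract-then-shift versus shift-then-mask). Assuming that reading, here is a standalone check, not from the patch, of the contiguous-mask identities they rely on, using naive reference implementations of the two bit operations.

#include <cassert>
#include <cstdint>

static uint32_t pext32(uint32_t X, uint32_t Mask) {
  uint32_t Res = 0;
  for (uint32_t Bit = 0, K = 0; Bit < 32; ++Bit)
    if (Mask & (1u << Bit))
      Res |= ((X >> Bit) & 1u) << K++; // gather selected bits to the bottom
  return Res;
}

static uint32_t pdep32(uint32_t X, uint32_t Mask) {
  uint32_t Res = 0;
  for (uint32_t Bit = 0, K = 0; Bit < 32; ++Bit)
    if (Mask & (1u << Bit))
      Res |= ((X >> K++) & 1u) << Bit; // scatter low bits into mask positions
  return Res;
}

int main() {
  const uint32_t Mask = 0x0ff00000u; // one contiguous run: MaskIdx = 20, MaskLen = 8
  const unsigned MaskIdx = 20;
  const uint32_t X = 0xdeadbeefu;
  assert(pext32(X, Mask) == ((X & Mask) >> MaskIdx)); // the and-then-lshr form
  assert(pdep32(X, Mask) == ((X << MaskIdx) & Mask)); // the shl-then-and form
  return 0;
}
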
@@ -1934,6 +1930,23 @@ Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
break;
}
+ // General per-element vector operations.
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256: {
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ UndefElts &= UndefElts2;
+ break;
+ }
+
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index d825981a6b36..5da06bc87b06 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,18 +48,23 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8XD;
// Pseudo instruction for RA.
- def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src),
- [(int_x86_ldtilecfg_internal addr:$src)]>;
+ let isPseudo = true, mayLoad = 1, hasSideEffects = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), []>;
+ let isPseudo = true, mayLoad = 1 in
def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
+ let isPseudo = true, mayLoad = 1 in
def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
+ let isPseudo = true, mayStore = 1 in
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
TILE:$src4), []>;
- let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+ let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1,
+ canFoldAsLoad = 1 in
def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2),
[(set TILE:$dst, (int_x86_tilezero_internal
GR16:$src1, GR16:$src2))]>;
@@ -67,9 +72,12 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
+ let mayLoad = 1 in
def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ let mayLoad = 1 in
def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
sibmem:$src2), []>;
+ let mayStore = 1 in
def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
[(int_x86_tilezero timm:$src)]>;
@@ -99,7 +107,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
}
// Pseudo instruction for RA.
- let Constraints = "$src4 = $dst" in {
+ let isPseudo = true, Constraints = "$src4 = $dst" in {
def PTDPBSSDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2, GR16:$src3, TILE:$src4,
TILE:$src5, TILE:$src6),
@@ -158,7 +166,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
[]>, VEX_4V, T8XS;
// Pseudo instruction for RA.
- let Constraints = "$src4 = $dst" in
+ let isPseudo = true, Constraints = "$src4 = $dst" in
def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2, GR16:$src3, TILE:$src4,
TILE:$src5, TILE:$src6),
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index bc67d1f89d7f..48da7b3ac882 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -476,6 +476,7 @@ let Predicates = [HasAVX512] in {
def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
}
@@ -508,25 +509,23 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
}
-let Predicates = [HasFP16] in {
-def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
-def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
-def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
-}
-
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
+ def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
+ [(set FR16X:$dst, fp16imm0)]>;
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@@ -535,12 +534,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
[(set VR128X:$dst, fp128imm0)]>;
}
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in {
- def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
- [(set FR16X:$dst, fp16imm0)]>;
-}
-
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
@@ -678,21 +671,21 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info,
- vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>;
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
// Codegen pattern with the alternative types insert VEC128 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info,
- vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>;
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
// Codegen pattern with the alternative types insert VEC256 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info,
- vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>;
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
@@ -979,7 +972,7 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info,
- vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>;
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
// Codegen pattern with the alternative types extract VEC128 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
@@ -987,14 +980,14 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info,
- vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>;
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
// Codegen pattern with the alternative types extract VEC256 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
- vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>;
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
@@ -1020,6 +1013,10 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
(v8i16 (VEXTRACTI128rr
(v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
+def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
+ (v8f16 (VEXTRACTF128rr
+ (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
(v16i8 (VEXTRACTI128rr
(v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
@@ -1049,18 +1046,16 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
(v8i16 (VEXTRACTI32x4Z256rr
(v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
+def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
+ (v8f16 (VEXTRACTF32x4Z256rr
+ (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
(v16i8 (VEXTRACTI32x4Z256rr
(v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
(iPTR 1)))>;
}
-let Predicates = [HasFP16, HasVLX] in
-def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
- (v8f16 (VEXTRACTF32x4Z256rr
- (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
- (iPTR 1)))>;
-
// Additional patterns for handling a bitcast between the vselect and the
// extract_subvector.
@@ -1478,7 +1473,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
-let Predicates = [HasFP16] in {
+let Predicates = [HasBWI] in {
def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)),
(VPBROADCASTWZrm addr:$src)>;
@@ -1487,7 +1482,7 @@ let Predicates = [HasFP16] in {
def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))),
(VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>;
}
-let Predicates = [HasVLX, HasFP16] in {
+let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
(VPBROADCASTWZ128rm addr:$src)>;
def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
@@ -3763,6 +3758,9 @@ let Predicates = [HasBWI, NoVLX] in {
defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>;
defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>;
+
+ defm : mask_move_lowering<"VMOVDQU16Z", v8f16x_info, v32f16_info>;
+ defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>;
}
let Predicates = [HasAVX512] in {
@@ -3852,7 +3850,7 @@ let Predicates = [HasVLX] in {
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
-let Predicates = [HasFP16] in {
+let Predicates = [HasBWI] in {
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))),
(VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
@@ -3887,7 +3885,7 @@ let Predicates = [HasFP16] in {
def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
(VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
}
-let Predicates = [HasFP16, HasVLX] in {
+let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
@@ -4099,14 +4097,14 @@ def : Pat<(f64 (bitconvert VK64:$src)),
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
- X86VectorVTInfo _,
- list<Predicate> prd = [HasAVX512, OptForSize]> {
- let Predicates = prd in
+ X86VectorVTInfo _, Predicate prd = HasAVX512> {
+ let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
_.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+ let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4159,6 +4157,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
[], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
NotMemoryFoldable;
+ }
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
@@ -4168,7 +4167,7 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info,
- [HasFP16]>,
+ HasFP16>,
VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
@@ -4338,14 +4337,9 @@ def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
addr:$srcAddr)>;
}
-defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
-defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
- (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
-defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
- (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -4353,6 +4347,12 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+let Predicates = [HasFP16] in {
+defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
+defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (insert_subvector
(v32i1 immAllZerosV),
@@ -4360,6 +4360,30 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
(iPTR 0))),
(v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
GR8, sub_8bit>;
+
+defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (insert_subvector
+ (v32i1 immAllZerosV),
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ GR8, sub_8bit>;
+
+def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
+ (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
+ (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
+ VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
+ (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
+
+def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
+ (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
+ (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
+}
+
defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (insert_subvector
(v16i1 immAllZerosV),
@@ -4385,10 +4409,6 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
(iPTR 0))), GR8, sub_8bit>;
-defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
- (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
-defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
- (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -4396,13 +4416,6 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
-defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
- (v32i1 (insert_subvector
- (v32i1 immAllZerosV),
- (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
- (iPTR 0))),
- (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
- GR8, sub_8bit>;
defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (insert_subvector
(v16i1 immAllZerosV),
@@ -4428,16 +4441,6 @@ defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
(iPTR 0))), GR8, sub_8bit>;
-def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
- (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
- (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
- VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
- (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
-
-def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
- (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
- (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
-
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
@@ -5039,7 +5042,7 @@ defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
HasBWI, 1>;
defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs,
SchedWriteVecIMul, HasBWI, 1>, T8PD;
-defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
+defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu,
SchedWriteVecALU, HasBWI, 1>;
defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
SchedWriteVecIMul, HasAVX512, 1>, T8PD;
@@ -11651,6 +11654,14 @@ defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
+// Always select FP16 instructions if available.
+let Predicates = [HasBWI], AddedComplexity = -10 in {
+ def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>;
+ def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
+ def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>;
+ def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>;
+}
+
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
//===----------------------------------------------------------------------===//
@@ -12988,7 +12999,6 @@ def : Pat<(i16 (bitconvert FR16X:$src)),
sub_16bit))>;
def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))),
(i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>;
-}
// Allow "vmovw" to use GR64
let hasSideEffects = 0 in {
@@ -12997,6 +13007,7 @@ let hasSideEffects = 0 in {
def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>;
}
+}
// Convert 16-bit float to i16/u16
multiclass avx512_cvtph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 8337d2b37383..f08ecdf6afc9 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -541,7 +541,7 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
Operand immoperand, SDPatternOperator immoperator,
Operand imm8operand, SDPatternOperator imm8operator,
bit hasOddOpcode, OperandSize opSize,
- bit hasREX_WPrefix> {
+ bit hasREX_W> {
/// VT - This is the value type itself.
ValueType VT = vt;
@@ -596,9 +596,9 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
/// to Opsize16. i32 sets this to OpSize32.
OperandSize OpSize = opSize;
- /// HasREX_WPrefix - This bit is set to true if the instruction should have
+ /// HasREX_W - This bit is set to true if the instruction should have
/// the 0x40 REX prefix. This is set for i64 types.
- bit HasREX_WPrefix = hasREX_WPrefix;
+ bit HasREX_W = hasREX_W;
}
def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
@@ -634,7 +634,7 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
// Infer instruction prefixes from type info.
let OpSize = typeinfo.OpSize;
- let hasREX_WPrefix = typeinfo.HasREX_WPrefix;
+ let hasREX_W = typeinfo.HasREX_W;
}
// BinOpRR - Instructions like "add reg, reg, reg".
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 330b8c7a8a43..79ac2a2d8019 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -14,7 +14,7 @@
// CMOV instructions.
let isCodeGenOnly = 1, ForceDisassemble = 1 in {
-let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst",
isCommutable = 1, SchedRW = [WriteCMOV] in {
def CMOV16rr
: I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
@@ -35,7 +35,7 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
(X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB;
}
-let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst",
SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
def CMOV16rm
: I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
@@ -52,7 +52,7 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
"cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
timm:$cond, EFLAGS))]>, TB;
-} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst"
} // isCodeGenOnly = 1, ForceDisassemble = 1
def inv_cond_XFORM : SDNodeXForm<imm, [{
@@ -63,7 +63,7 @@ def inv_cond_XFORM : SDNodeXForm<imm, [{
// Conditional moves with folded loads with operands swapped and conditions
// inverted.
-let Predicates = [HasCMov] in {
+let Predicates = [HasCMOV] in {
def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
(CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 7288ce812138..a55b95960aa6 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -544,10 +544,10 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
// i8 register pressure.
defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
- let Predicates = [NoCMov] in {
+ let Predicates = [NoCMOV] in {
defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
- } // Predicates = [NoCMov]
+ } // Predicates = [NoCMOV]
// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
// SSE1/SSE2.
@@ -562,12 +562,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
let Predicates = [HasMMX] in
defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>;
- defm _FR16X : CMOVrr_PSEUDO<FR16X, f16>;
let Predicates = [HasSSE1,NoAVX512] in
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
- let Predicates = [HasSSE2,NoAVX512] in
+ let Predicates = [HasSSE2,NoAVX512] in {
+ defm _FR16 : CMOVrr_PSEUDO<FR16, f16>;
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ }
let Predicates = [HasAVX512] in {
+ defm _FR16X : CMOVrr_PSEUDO<FR16X, f16>;
defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
}
@@ -670,7 +672,7 @@ def OR32mi8Locked : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$zero),
Requires<[Not64BitMode]>, OpSize32, LOCK,
Sched<[WriteALURMW]>;
-let hasSideEffects = 1 in
+let hasSideEffects = 1, isMeta = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
"#MEMBARRIER",
[(X86MemBarrier)]>, Sched<[WriteLoad]>;
@@ -839,6 +841,38 @@ let Predicates = [UseIncDec] in {
def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>;
}
+// Atomic bit test.
+def X86LBTest : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i32>]>;
+def x86bts : SDNode<"X86ISD::LBTS", X86LBTest,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86btc : SDNode<"X86ISD::LBTC", X86LBTest,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86btr : SDNode<"X86ISD::LBTR", X86LBTest,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
+ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ SchedRW = [WriteBitTestSetRegRMW] in {
+ def 16m : Ii8<0xBA, Form, (outs), (ins i16mem:$src1, i8imm:$src2),
+ !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (!cast<SDNode>("x86" # s) addr:$src1, timm:$src2, (i32 16)))]>,
+ OpSize16, TB, LOCK;
+ def 32m : Ii8<0xBA, Form, (outs), (ins i32mem:$src1, i8imm:$src2),
+ !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (!cast<SDNode>("x86" # s) addr:$src1, timm:$src2, (i32 32)))]>,
+ OpSize32, TB, LOCK;
+ def 64m : RIi8<0xBA, Form, (outs), (ins i64mem:$src1, i8imm:$src2),
+ !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (!cast<SDNode>("x86" # s) addr:$src1, timm:$src2, (i32 64)))]>,
+ TB, LOCK;
+ }
+}
+
+defm LOCK_BTS : ATOMIC_LOGIC_OP<MRM5m, "bts">;
+defm LOCK_BTC : ATOMIC_LOGIC_OP<MRM7m, "btc">;
+defm LOCK_BTR : ATOMIC_LOGIC_OP<MRM6m, "btr">;
+
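The LOCK_BTS/BTC/BTR pseudos above let single-bit atomic read-modify-write operations lower to a locked bit-test instruction instead of a compare-exchange loop. Below is a minimal C++ sketch of the source-level shape this targets (not LLVM API; the constant bit index mirrors the i8imm operand in the patterns):

#include <atomic>
#include <cstdint>

// Atomically set bit 3 of Word and report whether it was already set.
// A fetch_or of a single-bit mask where only that bit of the old value is
// consumed is the shape that can become `lock btsl $3, (mem)` plus a flag
// read, rather than a cmpxchg loop.
bool testAndSetBit3(std::atomic<uint32_t> &Word) {
  constexpr uint32_t Mask = 1u << 3;
  return (Word.fetch_or(Mask, std::memory_order_seq_cst) & Mask) != 0;
}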
// Atomic compare and swap.
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
string mnemonic, SDPatternOperator frag> {
@@ -863,7 +897,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
+ Predicates = [HasCX8], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, usesCustomInserter = 1 in {
def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
"cmpxchg8b\t$ptr",
@@ -871,7 +905,7 @@ def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
}
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
"cmpxchg16b\t$ptr",
@@ -898,7 +932,7 @@ def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, isPseudo = 1,
mayLoad = 1, mayStore = 1, hasSideEffects = 0,
Constraints = "$rbx_save = $dst" in {
@@ -910,7 +944,7 @@ def LCMPXCHG16B_SAVE_RBX :
// Pseudo instruction that doesn't read/write RBX. Will be turned into either
// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter.
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ Predicates = [HasCX16,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
isCodeGenOnly = 1, isPseudo = 1,
mayLoad = 1, mayStore = 1, hasSideEffects = 0,
usesCustomInserter = 1 in {
@@ -1235,6 +1269,21 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
return true;
}]>;
+def X86tcret_1reg : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ unsigned NumRegs = 1;
+ const SDValue& BasePtr = cast<LoadSDNode>(N->getOperand(1))->getBasePtr();
+ if (isa<FrameIndexSDNode>(BasePtr))
+ NumRegs = 3;
+ else if (BasePtr->getNumOperands() && isa<GlobalAddressSDNode>(BasePtr->getOperand(0)))
+ NumRegs = 3;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ( NumRegs-- == 0))
+ return false;
+ return true;
+}]>;
+
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
@@ -1242,7 +1291,8 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
-def : Pat<(X86tcret (load addr:$dst), timm:$off),
+// Similar to X86tcret_6regs, but here only one register is left.
+def : Pat<(X86tcret_1reg (load addr:$dst), timm:$off),
(TCRETURNmi addr:$dst, timm:$off)>,
Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;
@@ -1467,6 +1517,21 @@ def ADD64ri32_DB : I<0, Pseudo,
} // AddedComplexity, SchedRW
//===----------------------------------------------------------------------===//
+// Pattern match XOR as ADD
+//===----------------------------------------------------------------------===//
+
+// Prefer to pattern match XOR with min_signed_value as ADD at isel time.
+// ADD can be 3-addressified into an LEA instruction to avoid copies.
+let AddedComplexity = 5 in {
+def : Pat<(xor GR8:$src1, -128),
+ (ADD8ri GR8:$src1, -128)>;
+def : Pat<(xor GR16:$src1, -32768),
+ (ADD16ri GR16:$src1, -32768)>;
+def : Pat<(xor GR32:$src1, -2147483648),
+ (ADD32ri GR32:$src1, -2147483648)>;
+}
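The transform is sound because the addend's low bits are all zero, so adding the minimum signed value can only flip the sign bit, exactly like the XOR. A small standalone check of that identity, done on unsigned types so the wraparound is well defined:

#include <cassert>
#include <cstdint>

int main() {
  // x ^ 0x80000000 == x + 0x80000000 (mod 2^32): both flip bit 31 only,
  // since no carry from the addend's zero low bits can reach the sign bit.
  for (uint32_t x : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xdeadbeefu})
    assert((x ^ 0x80000000u) == uint32_t(x + 0x80000000u));
  // The same holds for the 8- and 16-bit constants (-128 and -32768) used
  // in the patterns above.
  for (uint16_t x : {uint16_t(0), uint16_t(0x1234), uint16_t(0x8000)})
    assert(uint16_t(x ^ 0x8000u) == uint16_t(x + 0x8000u));
  return 0;
}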
+
+//===----------------------------------------------------------------------===//
// Pattern match SUB as XOR
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index 6d969962afff..aa89a6f0ff9d 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -147,7 +147,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
// These are switched from TAILJMPr/m64_REX in MCInstLower.
- let isCodeGenOnly = 1, hasREX_WPrefix = 1 in {
+ let isCodeGenOnly = 1, hasREX_W = 1 in {
def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
"rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>;
let mayLoad = 1 in
@@ -384,7 +384,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
[]>, Sched<[WriteJumpLd]>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
- let hasREX_WPrefix = 1 in {
+ let hasREX_W = 1 in {
def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
[]>, Sched<[WriteJump]>;
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index e310f369be08..a68d61043c5c 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -423,9 +423,9 @@ def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
// Floating point cmovs.
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
- FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMOV]>;
class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
- FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMOV]>;
multiclass FPCMov<PatLeaf cc> {
def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
@@ -440,7 +440,7 @@ multiclass FPCMov<PatLeaf cc> {
CondMovFP,
[(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
cc, EFLAGS))]>,
- Requires<[HasCMov]>;
+ Requires<[HasCMOV]>;
}
let SchedRW = [WriteFCMOV] in {
@@ -455,7 +455,7 @@ defm CMOVNE : FPCMov<X86_COND_NE>;
defm CMOVNP : FPCMov<X86_COND_NP>;
} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
-let Predicates = [HasCMov] in {
+let Predicates = [HasCMOV] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RSTi:$op),
"fcmovb\t{$op, %st|st, $op}">;
@@ -473,7 +473,7 @@ def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RSTi:$op),
"fcmovne\t{$op, %st|st, $op}">;
def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op),
"fcmovnu\t{$op, %st|st, $op}">;
-} // Predicates = [HasCMov]
+} // Predicates = [HasCMOV]
} // SchedRW
let mayRaiseFPException = 1 in {
@@ -664,22 +664,22 @@ let SchedRW = [WriteFCom], mayRaiseFPException = 1 in {
let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>,
- Requires<[FPStackf32, HasCMov]>;
+ Requires<[FPStackf32, HasCMOV]>;
def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
[(set EFLAGS, (X86any_fcmp RFP64:$lhs, RFP64:$rhs))]>,
- Requires<[FPStackf64, HasCMov]>;
+ Requires<[FPStackf64, HasCMOV]>;
def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
[(set EFLAGS, (X86any_fcmp RFP80:$lhs, RFP80:$rhs))]>,
- Requires<[HasCMov]>;
+ Requires<[HasCMOV]>;
def COM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set EFLAGS, (X86strict_fcmps RFP32:$lhs, RFP32:$rhs))]>,
- Requires<[FPStackf32, HasCMov]>;
+ Requires<[FPStackf32, HasCMOV]>;
def COM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
[(set EFLAGS, (X86strict_fcmps RFP64:$lhs, RFP64:$rhs))]>,
- Requires<[FPStackf64, HasCMov]>;
+ Requires<[FPStackf64, HasCMOV]>;
def COM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
[(set EFLAGS, (X86strict_fcmps RFP80:$lhs, RFP80:$rhs))]>,
- Requires<[HasCMov]>;
+ Requires<[HasCMOV]>;
}
let Uses = [ST0, FPCW] in {
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 226349485238..27220a8d4d99 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -292,8 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD },
{ X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
{ X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD },
- { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE },
- { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MMX_MOVD64from64rr, X86::MMX_MOVQ64mr, TB_FOLDED_STORE },
+ { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE },
{ X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
{ X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
{ X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 0e7033fc233a..3a44b4570e9b 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -196,7 +196,7 @@ class OpSize32 { OperandSize OpSize = OpSize32; }
class AdSize16 { AddressSize AdSize = AdSize16; }
class AdSize32 { AddressSize AdSize = AdSize32; }
class AdSize64 { AddressSize AdSize = AdSize64; }
-class REX_W { bit hasREX_WPrefix = 1; }
+class REX_W { bit hasREX_W = 1; }
class LOCK { bit hasLockPrefix = 1; }
class REP { bit hasREPPrefix = 1; }
class TB { Map OpMap = TB; }
@@ -316,7 +316,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bits<3> OpPrefixBits = OpPrefix.Value;
Map OpMap = OB; // Which opcode map does this inst have?
bits<4> OpMapBits = OpMap.Value;
- bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
+ bit hasREX_W = 0; // Does this inst require the REX.W prefix?
FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
Domain ExeDomain = d;
@@ -375,7 +375,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// No need for 3rd bit, we don't need to distinguish NoPrfx from PS.
let TSFlags{12-11} = OpPrefixBits{1-0};
let TSFlags{16-13} = OpMapBits;
- let TSFlags{17} = hasREX_WPrefix;
+ let TSFlags{17} = hasREX_W;
let TSFlags{21-18} = ImmT.Value;
let TSFlags{24-22} = FPForm.Value;
let TSFlags{25} = hasLockPrefix;
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 166f1f8c3251..57ba4683c6a4 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -287,7 +287,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisSameAs<2, 1>]>;
def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
-def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 4dcd886fa3b2..ec32ac2acad1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -25,13 +25,16 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -137,298 +140,70 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- // By default, assume that the instruction is not data invariant.
+ if (MI.mayLoad() || MI.mayStore())
return false;
- // Some target-independent operations that trivially lower to data-invariant
- // instructions.
- case TargetOpcode::COPY:
- case TargetOpcode::INSERT_SUBREG:
- case TargetOpcode::SUBREG_TO_REG:
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
+ if (MI.isCopyLike() || MI.isInsertSubreg())
return true;
+ unsigned Opcode = MI.getOpcode();
+ using namespace X86;
// On x86 it is believed that imul is constant time w.r.t. the loaded data.
// However, they set flags and are perhaps the most surprisingly constant
// time operations so we call them out here separately.
- case X86::IMUL16rr:
- case X86::IMUL16rri8:
- case X86::IMUL16rri:
- case X86::IMUL32rr:
- case X86::IMUL32rri8:
- case X86::IMUL32rri:
- case X86::IMUL64rr:
- case X86::IMUL64rri32:
- case X86::IMUL64rri8:
-
+ if (isIMUL(Opcode))
+ return true;
// Bit scanning and counting instructions that are somewhat surprisingly
// constant time as they scan across bits and do other fairly complex
// operations like popcnt, but are believed to be constant time on x86.
// However, these set flags.
- case X86::BSF16rr:
- case X86::BSF32rr:
- case X86::BSF64rr:
- case X86::BSR16rr:
- case X86::BSR32rr:
- case X86::BSR64rr:
- case X86::LZCNT16rr:
- case X86::LZCNT32rr:
- case X86::LZCNT64rr:
- case X86::POPCNT16rr:
- case X86::POPCNT32rr:
- case X86::POPCNT64rr:
- case X86::TZCNT16rr:
- case X86::TZCNT32rr:
- case X86::TZCNT64rr:
-
+ if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
+ isTZCNT(Opcode))
+ return true;
// Bit manipulation instructions are effectively combinations of basic
// arithmetic ops, and should still execute in constant time. These also
// set flags.
- case X86::BLCFILL32rr:
- case X86::BLCFILL64rr:
- case X86::BLCI32rr:
- case X86::BLCI64rr:
- case X86::BLCIC32rr:
- case X86::BLCIC64rr:
- case X86::BLCMSK32rr:
- case X86::BLCMSK64rr:
- case X86::BLCS32rr:
- case X86::BLCS64rr:
- case X86::BLSFILL32rr:
- case X86::BLSFILL64rr:
- case X86::BLSI32rr:
- case X86::BLSI64rr:
- case X86::BLSIC32rr:
- case X86::BLSIC64rr:
- case X86::BLSMSK32rr:
- case X86::BLSMSK64rr:
- case X86::BLSR32rr:
- case X86::BLSR64rr:
- case X86::TZMSK32rr:
- case X86::TZMSK64rr:
-
+ if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
+ isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
+ isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
+ isTZMSK(Opcode))
+ return true;
// Bit extracting and clearing instructions should execute in constant time,
// and set flags.
- case X86::BEXTR32rr:
- case X86::BEXTR64rr:
- case X86::BEXTRI32ri:
- case X86::BEXTRI64ri:
- case X86::BZHI32rr:
- case X86::BZHI64rr:
-
+ if (isBEXTR(Opcode) || isBZHI(Opcode))
+ return true;
// Shift and rotate.
- case X86::ROL8r1:
- case X86::ROL16r1:
- case X86::ROL32r1:
- case X86::ROL64r1:
- case X86::ROL8rCL:
- case X86::ROL16rCL:
- case X86::ROL32rCL:
- case X86::ROL64rCL:
- case X86::ROL8ri:
- case X86::ROL16ri:
- case X86::ROL32ri:
- case X86::ROL64ri:
- case X86::ROR8r1:
- case X86::ROR16r1:
- case X86::ROR32r1:
- case X86::ROR64r1:
- case X86::ROR8rCL:
- case X86::ROR16rCL:
- case X86::ROR32rCL:
- case X86::ROR64rCL:
- case X86::ROR8ri:
- case X86::ROR16ri:
- case X86::ROR32ri:
- case X86::ROR64ri:
- case X86::SAR8r1:
- case X86::SAR16r1:
- case X86::SAR32r1:
- case X86::SAR64r1:
- case X86::SAR8rCL:
- case X86::SAR16rCL:
- case X86::SAR32rCL:
- case X86::SAR64rCL:
- case X86::SAR8ri:
- case X86::SAR16ri:
- case X86::SAR32ri:
- case X86::SAR64ri:
- case X86::SHL8r1:
- case X86::SHL16r1:
- case X86::SHL32r1:
- case X86::SHL64r1:
- case X86::SHL8rCL:
- case X86::SHL16rCL:
- case X86::SHL32rCL:
- case X86::SHL64rCL:
- case X86::SHL8ri:
- case X86::SHL16ri:
- case X86::SHL32ri:
- case X86::SHL64ri:
- case X86::SHR8r1:
- case X86::SHR16r1:
- case X86::SHR32r1:
- case X86::SHR64r1:
- case X86::SHR8rCL:
- case X86::SHR16rCL:
- case X86::SHR32rCL:
- case X86::SHR64rCL:
- case X86::SHR8ri:
- case X86::SHR16ri:
- case X86::SHR32ri:
- case X86::SHR64ri:
- case X86::SHLD16rrCL:
- case X86::SHLD32rrCL:
- case X86::SHLD64rrCL:
- case X86::SHLD16rri8:
- case X86::SHLD32rri8:
- case X86::SHLD64rri8:
- case X86::SHRD16rrCL:
- case X86::SHRD32rrCL:
- case X86::SHRD64rrCL:
- case X86::SHRD16rri8:
- case X86::SHRD32rri8:
- case X86::SHRD64rri8:
-
+ if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
+ isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
+ return true;
// Basic arithmetic is constant time on the input but does set flags.
- case X86::ADC8rr:
- case X86::ADC8ri:
- case X86::ADC16rr:
- case X86::ADC16ri:
- case X86::ADC16ri8:
- case X86::ADC32rr:
- case X86::ADC32ri:
- case X86::ADC32ri8:
- case X86::ADC64rr:
- case X86::ADC64ri8:
- case X86::ADC64ri32:
- case X86::ADD8rr:
- case X86::ADD8ri:
- case X86::ADD16rr:
- case X86::ADD16ri:
- case X86::ADD16ri8:
- case X86::ADD32rr:
- case X86::ADD32ri:
- case X86::ADD32ri8:
- case X86::ADD64rr:
- case X86::ADD64ri8:
- case X86::ADD64ri32:
- case X86::AND8rr:
- case X86::AND8ri:
- case X86::AND16rr:
- case X86::AND16ri:
- case X86::AND16ri8:
- case X86::AND32rr:
- case X86::AND32ri:
- case X86::AND32ri8:
- case X86::AND64rr:
- case X86::AND64ri8:
- case X86::AND64ri32:
- case X86::OR8rr:
- case X86::OR8ri:
- case X86::OR16rr:
- case X86::OR16ri:
- case X86::OR16ri8:
- case X86::OR32rr:
- case X86::OR32ri:
- case X86::OR32ri8:
- case X86::OR64rr:
- case X86::OR64ri8:
- case X86::OR64ri32:
- case X86::SBB8rr:
- case X86::SBB8ri:
- case X86::SBB16rr:
- case X86::SBB16ri:
- case X86::SBB16ri8:
- case X86::SBB32rr:
- case X86::SBB32ri:
- case X86::SBB32ri8:
- case X86::SBB64rr:
- case X86::SBB64ri8:
- case X86::SBB64ri32:
- case X86::SUB8rr:
- case X86::SUB8ri:
- case X86::SUB16rr:
- case X86::SUB16ri:
- case X86::SUB16ri8:
- case X86::SUB32rr:
- case X86::SUB32ri:
- case X86::SUB32ri8:
- case X86::SUB64rr:
- case X86::SUB64ri8:
- case X86::SUB64ri32:
- case X86::XOR8rr:
- case X86::XOR8ri:
- case X86::XOR16rr:
- case X86::XOR16ri:
- case X86::XOR16ri8:
- case X86::XOR32rr:
- case X86::XOR32ri:
- case X86::XOR32ri8:
- case X86::XOR64rr:
- case X86::XOR64ri8:
- case X86::XOR64ri32:
+ if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
+ isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
+ return true;
// Arithmetic with just 32-bit and 64-bit variants and no immediates.
- case X86::ADCX32rr:
- case X86::ADCX64rr:
- case X86::ADOX32rr:
- case X86::ADOX64rr:
- case X86::ANDN32rr:
- case X86::ANDN64rr:
+ if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode))
+ return true;
// Unary arithmetic operations.
- case X86::DEC8r:
- case X86::DEC16r:
- case X86::DEC32r:
- case X86::DEC64r:
- case X86::INC8r:
- case X86::INC16r:
- case X86::INC32r:
- case X86::INC64r:
- case X86::NEG8r:
- case X86::NEG16r:
- case X86::NEG32r:
- case X86::NEG64r:
-
+ if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
+ return true;
// Unlike other arithmetic, NOT doesn't set EFLAGS.
- case X86::NOT8r:
- case X86::NOT16r:
- case X86::NOT32r:
- case X86::NOT64r:
-
+ if (isNOT(Opcode))
+ return true;
// Various move instructions used to zero or sign extend things. Note that we
// intentionally don't support the _NOREX variants as we can't handle that
// register constraint anyways.
- case X86::MOVSX16rr8:
- case X86::MOVSX32rr8:
- case X86::MOVSX32rr16:
- case X86::MOVSX64rr8:
- case X86::MOVSX64rr16:
- case X86::MOVSX64rr32:
- case X86::MOVZX16rr8:
- case X86::MOVZX32rr8:
- case X86::MOVZX32rr16:
- case X86::MOVZX64rr8:
- case X86::MOVZX64rr16:
- case X86::MOV32rr:
-
+ if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
+ return true;
// Arithmetic instructions that are both constant time and don't set flags.
- case X86::RORX32ri:
- case X86::RORX64ri:
- case X86::SARX32rr:
- case X86::SARX64rr:
- case X86::SHLX32rr:
- case X86::SHLX64rr:
- case X86::SHRX32rr:
- case X86::SHRX64rr:
-
+ if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
+ return true;
// LEA doesn't actually access memory, and its arithmetic is constant time.
- case X86::LEA16r:
- case X86::LEA32r:
- case X86::LEA64_32r:
- case X86::LEA64r:
+ if (isLEA(Opcode))
return true;
- }
+ // By default, assume that the instruction is not data invariant.
+ return false;
}
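For context, isDataInvariant is queried per machine instruction by hardening code that needs to know whether execution time can depend on operand values. A hypothetical helper is sketched below; countDataInvariant is made up for illustration, and only isDataInvariant itself comes from this patch.

// Illustrative only: count instructions in a block whose timing is believed
// to be independent of their data operands.
#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

static unsigned countDataInvariant(llvm::X86InstrInfo &TII,
                                   llvm::MachineBasicBlock &MBB) {
  unsigned N = 0;
  for (llvm::MachineInstr &MI : MBB)
    if (TII.isDataInvariant(MI)) // non-const MachineInstr, as declared above
      ++N;
  return N;
}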
bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
@@ -990,6 +765,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::AVX_SET0:
case X86::FsFLD0SD:
case X86::FsFLD0SS:
+ case X86::FsFLD0SH:
case X86::FsFLD0F128:
case X86::KSET0D:
case X86::KSET0Q:
@@ -1192,6 +968,102 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
return ShAmt < 4 && ShAmt > 0;
}
+static bool findRedundantFlagInstr(MachineInstr &CmpInstr,
+ MachineInstr &CmpValDefInstr,
+ const MachineRegisterInfo *MRI,
+ MachineInstr **AndInstr,
+ const TargetRegisterInfo *TRI,
+ bool &NoSignFlag, bool &ClearsOverflowFlag) {
+ if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG)
+ return false;
+
+ if (CmpInstr.getOpcode() != X86::TEST64rr)
+ return false;
+
+ // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare`
+ // guarantees that it's analyzable only if two registers are identical.
+ assert(
+ (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
+ "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` "
+ "requires two reg operands are the same.");
+
+ // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
+ // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
+ // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
+ // redundant.
+ assert(
+ (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
+ "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG.");
+
+ // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically
+ // 0.
+ if (CmpValDefInstr.getOperand(1).getImm() != 0)
+ return false;
+
+ // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
+ // sub_32bit or sub_xmm.
+ if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
+ return false;
+
+ MachineInstr *VregDefInstr =
+ MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
+
+ assert(VregDefInstr && "Must have a definition (SSA)");
+
+ // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
+ // to simplify the subsequent analysis.
+ //
+ // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
+ // `CmpValDefInstr.getParent()`, this could be handled.
+ if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
+ return false;
+
+ if (X86::isAND(VregDefInstr->getOpcode())) {
+ // Get a sequence of instructions like
+ // %reg = and* ... // Set EFLAGS
+ // ... // EFLAGS not changed
+ // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
+ // test64rr %extended_reg, %extended_reg, implicit-def $eflags
+ //
+ // If subsequent readers use a subset of bits that don't change
+ // after `and*` instructions, it's likely that the test64rr could
+ // be optimized away.
+ for (const MachineInstr &Instr :
+ make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
+ MachineBasicBlock::iterator(CmpValDefInstr))) {
+ // Bail out if an instruction between 'VregDefInstr' and
+ // 'CmpValDefInstr' modifies EFLAGS.
+ if (Instr.modifiesRegister(X86::EFLAGS, TRI))
+ return false;
+ }
+
+ *AndInstr = VregDefInstr;
+
+ // AND instruction will essentially update SF and clear OF, so
+ // NoSignFlag should be false in the sense that SF is modified by `AND`.
+ //
+ // However, the implementation artificially sets `NoSignFlag` to true
+ // to poison the SF bit; that is to say, if SF is looked at later, the
+ // optimization (to erase TEST64rr) will be disabled.
+ //
+ // The reason to poison the SF bit is that its value could differ between
+ // the `AND` and the `TEST` operation; the sign bit is not known for `AND`,
+ // and is known to be 0 as a result of `TEST64rr`.
+ //
+ // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
+ // the AND instruction and using the static information to guide peephole
+ // optimization if possible. For example, it's possible to fold a
+ // conditional move into a copy if the relevant EFLAGS bits could be deduced
+ // from an immediate operand of the AND operation.
+ //
+ NoSignFlag = true;
+ // ClearsOverflowFlag is true for AND operation (no surprise).
+ ClearsOverflowFlag = true;
+ return true;
+ }
+ return false;
+}
+
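The reason the TEST64rr is removable in this pattern is that the 32-bit AND already sets ZF from its result, and zero-extending that result to 64 bits cannot change whether it is zero. A tiny standalone check of that equivalence (plain C++, no LLVM API):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t In : {0u, 4u, 5u, 0xffffffffu}) {
    uint32_t R = In & 5u; // and32ri: ZF is set iff R == 0
    uint64_t Ext = R;     // subreg_to_reg 0, ...: zero extension to 64 bits
    // test64rr Ext, Ext asks "Ext == 0?", the same question ZF answers.
    assert((R == 0) == (Ext == 0));
  }
  return 0;
}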
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned Opc, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
@@ -1314,8 +1186,11 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
case X86::SHL8ri:
case X86::SHL16ri: {
unsigned ShAmt = MI.getOperand(2).getImm();
- MIB.addReg(0).addImm(1ULL << ShAmt)
- .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
+ MIB.addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(InRegLEA, RegState::Kill)
+ .addImm(0)
+ .addReg(0);
break;
}
case X86::INC8r:
@@ -1478,7 +1353,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
.add(Dest)
.addReg(0)
- .addImm(1ULL << ShAmt)
+ .addImm(1LL << ShAmt)
.add(Src)
.addImm(0)
.addReg(0);
@@ -1502,7 +1377,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
BuildMI(MF, MI.getDebugLoc(), get(Opc))
.add(Dest)
.addReg(0)
- .addImm(1ULL << ShAmt)
+ .addImm(1LL << ShAmt)
.addReg(SrcReg, getKillRegState(isKill))
.addImm(0)
.addReg(0);
@@ -1957,14 +1832,13 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
FMAForms[0] = FMA3Group.get132Opcode();
FMAForms[1] = FMA3Group.get213Opcode();
FMAForms[2] = FMA3Group.get231Opcode();
- unsigned FormIndex;
- for (FormIndex = 0; FormIndex < 3; FormIndex++)
- if (Opc == FMAForms[FormIndex])
- break;
// Everything is ready, just adjust the FMA opcode and return it.
- FormIndex = FormMapping[Case][FormIndex];
- return FMAForms[FormIndex];
+ for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
+ if (Opc == FMAForms[FormIndex])
+ return FMAForms[FormMapping[Case][FormIndex]];
+
+ llvm_unreachable("Illegal FMA3 format");
}
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
@@ -2141,7 +2015,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
- WorkingMI.RemoveOperand(3);
+ WorkingMI.removeOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
/*NewMI=*/false,
OpIdx1, OpIdx2);
@@ -2238,7 +2112,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(X86::MOVSDrr));
- WorkingMI.RemoveOperand(3);
+ WorkingMI.removeOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -2813,34 +2687,37 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
return false;
}
+int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
+ unsigned Opcode = MCID.getOpcode();
+ if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
+ return -1;
+ // Assume that condition code is always the last use operand.
+ unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
+ return NumUses - 1;
+}
+
+X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ int CondNo = getCondSrcNoFromDesc(MCID);
+ if (CondNo < 0)
+ return X86::COND_INVALID;
+ CondNo += MCID.getNumDefs();
+ return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
+}
+
X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default: return X86::COND_INVALID;
- case X86::JCC_1:
- return static_cast<X86::CondCode>(
- MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
- }
+ return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
+ : X86::COND_INVALID;
}
-/// Return condition code of a SETCC opcode.
X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default: return X86::COND_INVALID;
- case X86::SETCCr: case X86::SETCCm:
- return static_cast<X86::CondCode>(
- MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
- }
+ return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
+ : X86::COND_INVALID;
}
-/// Return condition code of a CMov opcode.
X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default: return X86::COND_INVALID;
- case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
- case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
- return static_cast<X86::CondCode>(
- MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
- }
+ return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
+ : X86::COND_INVALID;
}
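A worked example of the operand arithmetic above, assuming CMOV32rr as defined earlier in X86InstrCMovSetCC.td (one def, then $src1, $src2, $cond): the source operand number is 3 - 1 = 2, and adding the def back gives machine-operand index 3, the same index the old getNumOperands() - 1 code used. A minimal sketch of that arithmetic with the operand counts written in as assumptions:

#include <cassert>

int main() {
  // Assumed counts for CMOV32rr: $dst (def), then $src1, $src2, $cond (uses).
  unsigned NumDefs = 1, NumOperands = 4;
  unsigned NumUses = NumOperands - NumDefs;           // 3
  int CondSrcNo = int(NumUses) - 1;                   // 2: condition is the last use
  unsigned CondOpIdx = NumDefs + unsigned(CondSrcNo); // 3
  assert(CondOpIdx == NumOperands - 1);               // agrees with the old code path
  return 0;
}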
/// Return the inverse of the specified condition,
@@ -3166,8 +3043,7 @@ bool X86InstrInfo::AnalyzeBranchImpl(
}
// If the block has any instructions after a JMP, delete them.
- while (std::next(I) != MBB.end())
- std::next(I)->eraseFromParent();
+ MBB.erase(std::next(I), MBB.end());
Cond.clear();
FBB = nullptr;
@@ -3464,7 +3340,7 @@ bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
Register FalseReg, int &CondCycles,
int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
- if (!Subtarget.hasCMov())
+ if (!Subtarget.canUseCMOV())
return false;
if (Cond.size() != 1)
return false;
@@ -3708,10 +3584,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 2:
if (X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
- if (X86::FR16XRegClass.hasSubClassEq(RC)) {
- assert(STI.hasFP16());
- return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
- }
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
return load ? X86::MOV16rm : X86::MOV16mr;
case 4:
@@ -3739,6 +3611,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
X86::VK16PAIRRegClass.hasSubClassEq(RC))
return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
+ if ((X86::FR16RegClass.hasSubClassEq(RC) ||
+ X86::FR16XRegClass.hasSubClassEq(RC)) &&
+ STI.hasFP16())
+ return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
@@ -3845,6 +3721,35 @@ X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
return AM;
}
+bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const {
+ Optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
+ if (!AMOrNone)
+ return true;
+
+ ExtAddrMode AM = *AMOrNone;
+
+ if (AM.ScaledReg != X86::NoRegister) {
+ switch (AM.Scale) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
+ return false;
+ }
+ }
+ if (!isInt<32>(AM.Displacement)) {
+ ErrInfo = "Displacement in address must fit into 32-bit signed "
+ "integer";
+ return false;
+ }
+
+ return true;
+}
+
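The two checks encode the hard limits of the x86 memory-operand encoding: the SIB scale is a two-bit log2 field (so 1, 2, 4 or 8), and the displacement is a signed 32-bit immediate. A small standalone model of those constraints (struct and function names are illustrative, not LLVM API):

#include <cstdint>
#include <limits>

// Illustrative model of an x86 address: Base + Index * Scale + Displacement.
struct AddrModeSketch {
  int64_t Scale = 1;        // meaningful only when an index register is used
  int64_t Displacement = 0;
  bool HasIndexReg = false;
};

static bool isEncodable(const AddrModeSketch &AM) {
  if (AM.HasIndexReg && AM.Scale != 1 && AM.Scale != 2 && AM.Scale != 4 &&
      AM.Scale != 8)
    return false; // the scale is encoded as log2 in a 2-bit SIB field
  return AM.Displacement >= std::numeric_limits<int32_t>::min() &&
         AM.Displacement <= std::numeric_limits<int32_t>::max();
}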
bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
const Register Reg,
int64_t &ImmVal) const {
@@ -3949,12 +3854,12 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILESTORED;
// tilestored %tmm, (%sp, %idx)
- MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
MachineInstr *NewMI =
@@ -3963,6 +3868,14 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(VirtReg);
MO.setIsKill(true);
+ } else if ((RC->getID() == X86::FR16RegClassID ||
+ RC->getID() == X86::FR16XRegClassID) &&
+ !Subtarget.hasFP16()) {
+ unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr
+ : Subtarget.hasAVX() ? X86::VMOVSSmr
+ : X86::MOVSSmr;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
} else {
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
@@ -3991,6 +3904,14 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(3);
MO.setReg(VirtReg);
MO.setIsKill(true);
+ } else if ((RC->getID() == X86::FR16RegClassID ||
+ RC->getID() == X86::FR16XRegClassID) &&
+ !Subtarget.hasFP16()) {
+ unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm
+ : Subtarget.hasAVX() ? X86::VMOVSSrm
+ : X86::MOVSSrm;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
} else {
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -4375,7 +4296,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
}
CmpInstr.setDesc(get(NewOpcode));
- CmpInstr.RemoveOperand(0);
+ CmpInstr.removeOperand(0);
// Mutating this instruction invalidates any debug data associated with it.
CmpInstr.dropDebugNumber();
// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
@@ -4423,6 +4344,23 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
MI = &Inst;
break;
}
+
+ // Look back for the following pattern, in which case the test64rr
+ // instruction could be erased.
+ //
+ // Example:
+ // %reg = and32ri %in_reg, 5
+ // ... // EFLAGS not changed.
+ // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
+ // test64rr %src_reg, %src_reg, implicit-def $eflags
+ MachineInstr *AndInstr = nullptr;
+ if (IsCmpZero &&
+ findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
+ NoSignFlag, ClearsOverflowFlag)) {
+ assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
+ MI = AndInstr;
+ break;
+ }
// Cannot find other candidates before definition of SrcReg.
return false;
}
@@ -4524,6 +4462,11 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return false;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
+ // If SF is used, but the instruction doesn't update the SF, then we
+ // can't do the optimization.
+ if (NoSignFlag)
+ return false;
+ LLVM_FALLTHROUGH;
case X86::COND_O: case X86::COND_NO:
// If OF is used, the instruction needs to clear it like CmpZero does.
if (!ClearsOverflowFlag)
@@ -4811,7 +4754,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
MIB->setDesc(TII.get(X86::POP32r));
}
- MIB->RemoveOperand(1);
+ MIB->removeOperand(1);
MIB->addImplicitDefUseOperands(*MBB.getParent());
// Build CFI if necessary.
@@ -4918,7 +4861,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
MIB->setDesc(Desc);
int64_t ShiftAmt = MIB->getOperand(2).getImm();
// Temporarily remove the immediate so we can add another source register.
- MIB->RemoveOperand(2);
+ MIB->removeOperand(2);
// Add the register. Don't copy the kill flag if there is one.
MIB.addReg(MIB.getReg(1),
getUndefRegState(MIB->getOperand(1).isUndef()));
@@ -4949,6 +4892,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
+ case X86::FsFLD0SH:
case X86::FsFLD0F128:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
case X86::AVX_SET0: {
@@ -5026,7 +4970,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned MaskState = getRegState(MIB->getOperand(1));
unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
- MI.RemoveOperand(1);
+ MI.removeOperand(1);
MIB->setDesc(get(Opc));
// VPTERNLOG needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
@@ -5165,6 +5109,255 @@ static bool hasPartialRegUpdate(unsigned Opcode,
case X86::SQRTSDr_Int:
case X86::SQRTSDm_Int:
return true;
+ case X86::VFCMULCPHZ128rm:
+ case X86::VFCMULCPHZ128rmb:
+ case X86::VFCMULCPHZ128rmbkz:
+ case X86::VFCMULCPHZ128rmkz:
+ case X86::VFCMULCPHZ128rr:
+ case X86::VFCMULCPHZ128rrkz:
+ case X86::VFCMULCPHZ256rm:
+ case X86::VFCMULCPHZ256rmb:
+ case X86::VFCMULCPHZ256rmbkz:
+ case X86::VFCMULCPHZ256rmkz:
+ case X86::VFCMULCPHZ256rr:
+ case X86::VFCMULCPHZ256rrkz:
+ case X86::VFCMULCPHZrm:
+ case X86::VFCMULCPHZrmb:
+ case X86::VFCMULCPHZrmbkz:
+ case X86::VFCMULCPHZrmkz:
+ case X86::VFCMULCPHZrr:
+ case X86::VFCMULCPHZrrb:
+ case X86::VFCMULCPHZrrbkz:
+ case X86::VFCMULCPHZrrkz:
+ case X86::VFMULCPHZ128rm:
+ case X86::VFMULCPHZ128rmb:
+ case X86::VFMULCPHZ128rmbkz:
+ case X86::VFMULCPHZ128rmkz:
+ case X86::VFMULCPHZ128rr:
+ case X86::VFMULCPHZ128rrkz:
+ case X86::VFMULCPHZ256rm:
+ case X86::VFMULCPHZ256rmb:
+ case X86::VFMULCPHZ256rmbkz:
+ case X86::VFMULCPHZ256rmkz:
+ case X86::VFMULCPHZ256rr:
+ case X86::VFMULCPHZ256rrkz:
+ case X86::VFMULCPHZrm:
+ case X86::VFMULCPHZrmb:
+ case X86::VFMULCPHZrmbkz:
+ case X86::VFMULCPHZrmkz:
+ case X86::VFMULCPHZrr:
+ case X86::VFMULCPHZrrb:
+ case X86::VFMULCPHZrrbkz:
+ case X86::VFMULCPHZrrkz:
+ case X86::VFCMULCSHZrm:
+ case X86::VFCMULCSHZrmkz:
+ case X86::VFCMULCSHZrr:
+ case X86::VFCMULCSHZrrb:
+ case X86::VFCMULCSHZrrbkz:
+ case X86::VFCMULCSHZrrkz:
+ case X86::VFMULCSHZrm:
+ case X86::VFMULCSHZrmkz:
+ case X86::VFMULCSHZrr:
+ case X86::VFMULCSHZrrb:
+ case X86::VFMULCSHZrrbkz:
+ case X86::VFMULCSHZrrkz:
+ return Subtarget.hasMULCFalseDeps();
+ case X86::VPERMDYrm:
+ case X86::VPERMDYrr:
+ case X86::VPERMQYmi:
+ case X86::VPERMQYri:
+ case X86::VPERMPSYrm:
+ case X86::VPERMPSYrr:
+ case X86::VPERMPDYmi:
+ case X86::VPERMPDYri:
+ case X86::VPERMDZ256rm:
+ case X86::VPERMDZ256rmb:
+ case X86::VPERMDZ256rmbkz:
+ case X86::VPERMDZ256rmkz:
+ case X86::VPERMDZ256rr:
+ case X86::VPERMDZ256rrkz:
+ case X86::VPERMDZrm:
+ case X86::VPERMDZrmb:
+ case X86::VPERMDZrmbkz:
+ case X86::VPERMDZrmkz:
+ case X86::VPERMDZrr:
+ case X86::VPERMDZrrkz:
+ case X86::VPERMQZ256mbi:
+ case X86::VPERMQZ256mbikz:
+ case X86::VPERMQZ256mi:
+ case X86::VPERMQZ256mikz:
+ case X86::VPERMQZ256ri:
+ case X86::VPERMQZ256rikz:
+ case X86::VPERMQZ256rm:
+ case X86::VPERMQZ256rmb:
+ case X86::VPERMQZ256rmbkz:
+ case X86::VPERMQZ256rmkz:
+ case X86::VPERMQZ256rr:
+ case X86::VPERMQZ256rrkz:
+ case X86::VPERMQZmbi:
+ case X86::VPERMQZmbikz:
+ case X86::VPERMQZmi:
+ case X86::VPERMQZmikz:
+ case X86::VPERMQZri:
+ case X86::VPERMQZrikz:
+ case X86::VPERMQZrm:
+ case X86::VPERMQZrmb:
+ case X86::VPERMQZrmbkz:
+ case X86::VPERMQZrmkz:
+ case X86::VPERMQZrr:
+ case X86::VPERMQZrrkz:
+ case X86::VPERMPSZ256rm:
+ case X86::VPERMPSZ256rmb:
+ case X86::VPERMPSZ256rmbkz:
+ case X86::VPERMPSZ256rmkz:
+ case X86::VPERMPSZ256rr:
+ case X86::VPERMPSZ256rrkz:
+ case X86::VPERMPSZrm:
+ case X86::VPERMPSZrmb:
+ case X86::VPERMPSZrmbkz:
+ case X86::VPERMPSZrmkz:
+ case X86::VPERMPSZrr:
+ case X86::VPERMPSZrrkz:
+ case X86::VPERMPDZ256mbi:
+ case X86::VPERMPDZ256mbikz:
+ case X86::VPERMPDZ256mi:
+ case X86::VPERMPDZ256mikz:
+ case X86::VPERMPDZ256ri:
+ case X86::VPERMPDZ256rikz:
+ case X86::VPERMPDZ256rm:
+ case X86::VPERMPDZ256rmb:
+ case X86::VPERMPDZ256rmbkz:
+ case X86::VPERMPDZ256rmkz:
+ case X86::VPERMPDZ256rr:
+ case X86::VPERMPDZ256rrkz:
+ case X86::VPERMPDZmbi:
+ case X86::VPERMPDZmbikz:
+ case X86::VPERMPDZmi:
+ case X86::VPERMPDZmikz:
+ case X86::VPERMPDZri:
+ case X86::VPERMPDZrikz:
+ case X86::VPERMPDZrm:
+ case X86::VPERMPDZrmb:
+ case X86::VPERMPDZrmbkz:
+ case X86::VPERMPDZrmkz:
+ case X86::VPERMPDZrr:
+ case X86::VPERMPDZrrkz:
+ return Subtarget.hasPERMFalseDeps();
+ case X86::VRANGEPDZ128rmbi:
+ case X86::VRANGEPDZ128rmbikz:
+ case X86::VRANGEPDZ128rmi:
+ case X86::VRANGEPDZ128rmikz:
+ case X86::VRANGEPDZ128rri:
+ case X86::VRANGEPDZ128rrikz:
+ case X86::VRANGEPDZ256rmbi:
+ case X86::VRANGEPDZ256rmbikz:
+ case X86::VRANGEPDZ256rmi:
+ case X86::VRANGEPDZ256rmikz:
+ case X86::VRANGEPDZ256rri:
+ case X86::VRANGEPDZ256rrikz:
+ case X86::VRANGEPDZrmbi:
+ case X86::VRANGEPDZrmbikz:
+ case X86::VRANGEPDZrmi:
+ case X86::VRANGEPDZrmikz:
+ case X86::VRANGEPDZrri:
+ case X86::VRANGEPDZrrib:
+ case X86::VRANGEPDZrribkz:
+ case X86::VRANGEPDZrrikz:
+ case X86::VRANGEPSZ128rmbi:
+ case X86::VRANGEPSZ128rmbikz:
+ case X86::VRANGEPSZ128rmi:
+ case X86::VRANGEPSZ128rmikz:
+ case X86::VRANGEPSZ128rri:
+ case X86::VRANGEPSZ128rrikz:
+ case X86::VRANGEPSZ256rmbi:
+ case X86::VRANGEPSZ256rmbikz:
+ case X86::VRANGEPSZ256rmi:
+ case X86::VRANGEPSZ256rmikz:
+ case X86::VRANGEPSZ256rri:
+ case X86::VRANGEPSZ256rrikz:
+ case X86::VRANGEPSZrmbi:
+ case X86::VRANGEPSZrmbikz:
+ case X86::VRANGEPSZrmi:
+ case X86::VRANGEPSZrmikz:
+ case X86::VRANGEPSZrri:
+ case X86::VRANGEPSZrrib:
+ case X86::VRANGEPSZrribkz:
+ case X86::VRANGEPSZrrikz:
+ case X86::VRANGESDZrmi:
+ case X86::VRANGESDZrmikz:
+ case X86::VRANGESDZrri:
+ case X86::VRANGESDZrrib:
+ case X86::VRANGESDZrribkz:
+ case X86::VRANGESDZrrikz:
+ case X86::VRANGESSZrmi:
+ case X86::VRANGESSZrmikz:
+ case X86::VRANGESSZrri:
+ case X86::VRANGESSZrrib:
+ case X86::VRANGESSZrribkz:
+ case X86::VRANGESSZrrikz:
+ return Subtarget.hasRANGEFalseDeps();
+ case X86::VGETMANTSSZrmi:
+ case X86::VGETMANTSSZrmikz:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrib:
+ case X86::VGETMANTSSZrribkz:
+ case X86::VGETMANTSSZrrikz:
+ case X86::VGETMANTSDZrmi:
+ case X86::VGETMANTSDZrmikz:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrib:
+ case X86::VGETMANTSDZrribkz:
+ case X86::VGETMANTSDZrrikz:
+ case X86::VGETMANTSHZrmi:
+ case X86::VGETMANTSHZrmikz:
+ case X86::VGETMANTSHZrri:
+ case X86::VGETMANTSHZrrib:
+ case X86::VGETMANTSHZrribkz:
+ case X86::VGETMANTSHZrrikz:
+ case X86::VGETMANTPSZ128rmbi:
+ case X86::VGETMANTPSZ128rmbikz:
+ case X86::VGETMANTPSZ128rmi:
+ case X86::VGETMANTPSZ128rmikz:
+ case X86::VGETMANTPSZ256rmbi:
+ case X86::VGETMANTPSZ256rmbikz:
+ case X86::VGETMANTPSZ256rmi:
+ case X86::VGETMANTPSZ256rmikz:
+ case X86::VGETMANTPSZrmbi:
+ case X86::VGETMANTPSZrmbikz:
+ case X86::VGETMANTPSZrmi:
+ case X86::VGETMANTPSZrmikz:
+ case X86::VGETMANTPDZ128rmbi:
+ case X86::VGETMANTPDZ128rmbikz:
+ case X86::VGETMANTPDZ128rmi:
+ case X86::VGETMANTPDZ128rmikz:
+ case X86::VGETMANTPDZ256rmbi:
+ case X86::VGETMANTPDZ256rmbikz:
+ case X86::VGETMANTPDZ256rmi:
+ case X86::VGETMANTPDZ256rmikz:
+ case X86::VGETMANTPDZrmbi:
+ case X86::VGETMANTPDZrmbikz:
+ case X86::VGETMANTPDZrmi:
+ case X86::VGETMANTPDZrmikz:
+ return Subtarget.hasGETMANTFalseDeps();
+ case X86::VPMULLQZ128rm:
+ case X86::VPMULLQZ128rmb:
+ case X86::VPMULLQZ128rmbkz:
+ case X86::VPMULLQZ128rmkz:
+ case X86::VPMULLQZ128rr:
+ case X86::VPMULLQZ128rrkz:
+ case X86::VPMULLQZ256rm:
+ case X86::VPMULLQZ256rmb:
+ case X86::VPMULLQZ256rmbkz:
+ case X86::VPMULLQZ256rmkz:
+ case X86::VPMULLQZ256rr:
+ case X86::VPMULLQZ256rrkz:
+ case X86::VPMULLQZrm:
+ case X86::VPMULLQZrmb:
+ case X86::VPMULLQZrmbkz:
+ case X86::VPMULLQZrmkz:
+ case X86::VPMULLQZrr:
+ case X86::VPMULLQZrrkz:
+ return Subtarget.hasMULLQFalseDeps();
// GPR
case X86::POPCNT32rm:
case X86::POPCNT32rr:
@@ -5591,6 +5784,28 @@ void X86InstrInfo::breakPartialRegDependency(
.addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);
MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR128XRegClass.contains(Reg)) {
+ // Only handle VLX targets.
+ if (!Subtarget.hasVLX())
+ return;
+ // Since vxorps requires AVX512DQ, vpxord should be the best choice.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR256XRegClass.contains(Reg) ||
+ X86::VR512RegClass.contains(Reg)) {
+ // Only handle VLX targets.
+ if (!Subtarget.hasVLX())
+ return;
+ // Use vpxord to clear the full ymm/zmm register.
+ // It wants to read and write the xmm sub-register.
+ Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
} else if (X86::GR64RegClass.contains(Reg)) {
// Using XOR32rr because it has shorter encoding and zeros up the upper bits
// as well.
@@ -6413,6 +6628,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_FsFLD0SS:
Alignment = Align(4);
break;
+ case X86::FsFLD0SH:
case X86::AVX512_FsFLD0SH:
Alignment = Align(2);
break;
@@ -6451,6 +6667,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
+ case X86::FsFLD0SH:
case X86::AVX512_FsFLD0SH:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
@@ -6490,7 +6707,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Ty = Type::getDoubleTy(MF.getFunction().getContext());
else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
Ty = Type::getFP128Ty(MF.getFunction().getContext());
- else if (Opc == X86::AVX512_FsFLD0SH)
+ else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH)
Ty = Type::getHalfTy(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
@@ -7170,7 +7387,7 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// ENDBR instructions should not be scheduled around.
unsigned Opcode = MI.getOpcode();
if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
- Opcode == X86::LDTILECFG)
+ Opcode == X86::PLDTILECFGV)
return true;
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
@@ -9298,12 +9515,10 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
// We check to see if CFI Instructions are present, and if they are
// we find the number of CFI Instructions in the candidates.
unsigned CFICount = 0;
- MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
- for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
- Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
- if (MBBI->isCFIInstruction())
+ for (auto &I : make_range(RepeatedSequenceLocs[0].front(),
+ std::next(RepeatedSequenceLocs[0].back()))) {
+ if (I.isCFIInstruction())
CFICount++;
- MBBI++;
}
// We compare the number of found CFI Instructions to the number of CFI
@@ -9440,7 +9655,7 @@ MachineBasicBlock::iterator
X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It,
MachineFunction &MF,
- const outliner::Candidate &C) const {
+ outliner::Candidate &C) const {
// Is it a tail call?
if (C.CallConstructionID == MachineOutlinerTailCall) {
// Yes, just insert a JMP.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 33ce55bbdb2b..4943d2152fd2 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -40,13 +40,21 @@ std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
/// Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
-// Turn jCC instruction into condition code.
+/// Return the source operand # for condition code by \p MCID. If the
+/// instruction doesn't have a condition code, return -1.
+int getCondSrcNoFromDesc(const MCInstrDesc &MCID);
+
+/// Return the condition code of the instruction. If the instruction doesn't
+/// have a condition code, return X86::COND_INVALID.
+CondCode getCondFromMI(const MachineInstr &MI);
+
+// Turn JCC instruction into condition code.
CondCode getCondFromBranch(const MachineInstr &MI);
-// Turn setCC instruction into condition code.
+// Turn SETCC instruction into condition code.
CondCode getCondFromSETCC(const MachineInstr &MI);
-// Turn CMov instruction into condition code.
+// Turn CMOV instruction into condition code.
CondCode getCondFromCMov(const MachineInstr &MI);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
@@ -552,8 +560,10 @@ public:
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It, MachineFunction &MF,
- const outliner::Candidate &C) const override;
+ outliner::Candidate &C) const override;
+ bool verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const override;
#define GET_INSTRINFO_HELPER_DECLS
#include "X86GenInstrInfo.inc"
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index fee9939b8dfc..7f6ef3479d40 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -388,17 +388,19 @@ def X86AbsMemAsmOperand : AsmOperandClass {
}
class X86MemOperand<string printMethod,
- AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
+ AsmOperandClass parserMatchClass = X86MemAsmOperand,
+ int size = 0> : Operand<iPTR> {
let PrintMethod = printMethod;
let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
let ParserMatchClass = parserMatchClass;
let OperandType = "OPERAND_MEMORY";
+ int Size = size;
}
// Gather mem operands
class X86VMemOperand<RegisterClass RC, string printMethod,
- AsmOperandClass parserMatchClass>
- : X86MemOperand<printMethod, parserMatchClass> {
+ AsmOperandClass parserMatchClass, int size = 0>
+ : X86MemOperand<printMethod, parserMatchClass, size> {
let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
}
@@ -413,48 +415,45 @@ def opaquemem : X86MemOperand<"printMemReference">;
def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>;
-def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
-def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
-def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
-def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
-def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
-def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
-def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
-def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
-def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
-def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
-def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>;
-def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
-def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
-def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
+def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand, 8>;
+def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand, 16>;
+def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand, 32>;
+def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64>;
+def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand, 128>;
+def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand, 256>;
+def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand, 512>;
+def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand, 16>;
+def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand, 32>;
+def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand, 64>;
+def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand, 80>;
+def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand, 128>;
+def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand, 256>;
+def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand, 512>;
// Gather mem operands
-def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand>;
-def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand>;
-def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand>;
-def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand>;
-def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand>;
-
-def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand>;
-def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand>;
-def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", X86Mem256_RC128XOperand>;
-def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand>;
-def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand>;
-def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand>;
-def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand>;
-def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand>;
+def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand, 64>;
+def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand, 128>;
+def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand, 256>;
+def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand, 128>;
+def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand, 256>;
+
+def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand, 64>;
+def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand, 128>;
+def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", X86Mem256_RC128XOperand, 256>;
+def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand, 128>;
+def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand, 256>;
+def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand, 512>;
+def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand, 256>;
+def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand, 512>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
// of a plain GPR, so that it doesn't potentially require a REX prefix.
def ptr_rc_norex : PointerLikeRegClass<2>;
def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
-def i8mem_NOREX : Operand<iPTR> {
- let PrintMethod = "printbytemem";
+def i8mem_NOREX : X86MemOperand<"printbytemem", X86Mem8AsmOperand, 8> {
let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
SEGMENT_REG);
- let ParserMatchClass = X86Mem8AsmOperand;
- let OperandType = "OPERAND_MEMORY";
}
// GPRs available for tailcall.
@@ -840,11 +839,11 @@ def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
// Define X86-specific addressing mode.
def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
- [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ [add, sub, mul, X86mul_imm, shl, or, xor, frameindex],
[]>;
// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
- [add, sub, mul, X86mul_imm, shl, or,
+ [add, sub, mul, X86mul_imm, shl, or, xor,
frameindex, X86WrapperRIP],
[]>;
@@ -855,7 +854,7 @@ def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
- [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ [add, sub, mul, X86mul_imm, shl, or, xor, frameindex,
X86WrapperRIP], []>;
def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
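
// A minimal standalone sketch (not LLVM code) of why `xor` can join add/or in
// the LEA address patterns above: when two operands share no set bits, xor, or
// and add all produce the same value, so an xor with provably disjoint bits can
// be folded into LEA's base + index*scale + disp form, presumably under the
// same disjoint-bits condition the existing `or` handling relies on.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0x1000; // low bits known to be zero
  uint64_t Disp = 0x7;    // fits entirely in those zero bits
  assert((Base & Disp) == 0);             // disjoint bits
  assert((Base ^ Disp) == (Base | Disp)); // xor behaves like or
  assert((Base ^ Disp) == Base + Disp);   // and like add, which LEA computes
  return 0;
}
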
@@ -875,12 +874,12 @@ def relocImm : ComplexPattern<iAny, 1, "selectRelocImm",
// X86 Instruction Predicate Definitions.
def TruePredicate : Predicate<"true">;
-def HasCMov : Predicate<"Subtarget->hasCMov()">;
-def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
+def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
-def Has3DNow : Predicate<"Subtarget->has3DNow()">;
-def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
+def Has3DNow : Predicate<"Subtarget->hasThreeDNow()">;
+def Has3DNowA : Predicate<"Subtarget->hasThreeDNowA()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
@@ -981,8 +980,8 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
-def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
-def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def HasCX8 : Predicate<"Subtarget->hasCX8()">;
+def HasCX16 : Predicate<"Subtarget->hasCX16()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
def HasKL : Predicate<"Subtarget->hasKL()">;
@@ -996,25 +995,25 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def HasCRC32 : Predicate<"Subtarget->hasCRC32()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
- AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
+ AssemblerPredicate<(all_of (not Is64Bit)), "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
- AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">;
+ AssemblerPredicate<(all_of Is64Bit), "64-bit mode">;
def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
def In16BitMode : Predicate<"Subtarget->is16Bit()">,
- AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">;
+ AssemblerPredicate<(all_of Is16Bit), "16-bit mode">;
def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
- AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">;
+ AssemblerPredicate<(all_of (not Is16Bit)), "Not 16-bit mode">;
def In32BitMode : Predicate<"Subtarget->is32Bit()">,
- AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">;
+ AssemblerPredicate<(all_of Is32Bit), "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
"Subtarget->getFrameLowering()->hasFP(*MF)"> {
let RecomputePerFunction = 1;
}
-def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
-def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
+def IsPS : Predicate<"Subtarget->isTargetPS()">;
+def NotPS : Predicate<"!Subtarget->isTargetPS()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
@@ -2229,13 +2228,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
- "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>;
+ "cmpxchg8b\t$dst", []>, TB, Requires<[HasCX8]>;
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
// NOTE: In64BitMode check needed for the AssemblerPredicate.
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", []>,
- TB, Requires<[HasCmpxchg16b,In64BitMode]>;
+ TB, Requires<[HasCX16,In64BitMode]>;
} // SchedRW, mayLoad, mayStore, hasSideEffects
@@ -2851,7 +2850,7 @@ let SchedRW = [WriteSystem] in {
def TPAUSE : I<0xAE, MRM6r,
(outs), (ins GR32orGR64:$src), "tpause\t$src",
[(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>,
- PD, Requires<[HasWAITPKG]>, NotMemoryFoldable;
+ PD, Requires<[HasWAITPKG]>;
}
} // SchedRW
@@ -2939,7 +2938,7 @@ def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
let SchedRW = [WriteSystem] in {
let Uses = [EAX, EDX] in
def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins),
- "invlpgb}", []>,
+ "invlpgb", []>,
PS, Requires<[Not64BitMode]>;
let Uses = [RAX, EDX] in
def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins),
@@ -3124,7 +3123,7 @@ def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
- [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable;
+ [(int_x86_clwb addr:$src)]>, PD;
let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index aeecc25ddea2..4196aff240c4 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -211,10 +211,10 @@ def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
(MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
+def MMX_MOVD64from64mr : MMXRI<0x7E, MRMDestMem,
(outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}", []>,
- Sched<[SchedWriteVecMoveLS.MMX.MR]>;
+ Sched<[SchedWriteVecMoveLS.MMX.MR]>, NotMemoryFoldable;
let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in {
let canFoldAsLoad = 1 in
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 035f139e6f33..06cb280e860a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -112,6 +112,8 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
+ def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
+ [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
@@ -3471,9 +3473,9 @@ defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
-defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
SchedWriteVecIMul, 1, NoVLX>;
@@ -3965,6 +3967,20 @@ defm PINSRW : sse2_pinsrw, PD;
} // ExeDomain = SSEPackedInt
+// Always select FP16 instructions if available.
+let Predicates = [UseSSE2], AddedComplexity = -10 in {
+ def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
+ def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
+ def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
+ def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
+}
+
+let Predicates = [HasAVX, NoBWI] in {
+ def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
+ def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
+ def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
+}
+
//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//
@@ -3997,7 +4013,10 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
//===---------------------------------------------------------------------===//
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
-let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
+// As VEX does not have separate instruction contexts for address size
+// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
+// Prefer VMASKMOVDQU64.
+let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
@@ -4008,32 +4027,16 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
- VEX, VEX_WIG, AdSize64;
-let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in
-def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs),
- (ins VR128:$src, VR128:$mask), "",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
- VEX, VEX_WIG, AdSize32 {
- let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}";
- let AsmVariantName = "NonParsable";
-}
+ VEX, VEX_WIG;
-let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
+let Uses = [EDI], Predicates = [UseSSE2] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
- AdSize64;
-let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in
-def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
- "addr32 maskmovdqu\t{$mask, $src|$src, $mask}",
- [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
- AdSize32 {
- let AsmVariantName = "NonParsable";
-}
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
} // ExeDomain = SSEPackedInt
@@ -5206,6 +5209,12 @@ let Predicates = [HasAVX, NoBWI] in
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+let Predicates = [UseSSE41] in
+ def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
+
+let Predicates = [HasAVX, NoBWI] in
+ def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
+
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
@@ -7588,6 +7597,21 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
(VPBROADCASTWYrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR16:$src, sub_16bit))))>;
+
+ def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
+ (VPBROADCASTWYrm addr:$src)>;
+
+ def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
+ (VPBROADCASTWrr VR128:$src)>;
+ def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
+ (VPBROADCASTWYrr VR128:$src)>;
+
+ def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
+ (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
+ def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
+ (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index b4dd99d08a62..3a653a56e534 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -25,18 +25,18 @@ let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
- "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ "ud1{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
- "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ "ud1{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
- "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
+ "ud1{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
- "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ "ud1{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
- "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ "ud1{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
- "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
+ "ud1{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
}
let isTerminator = 1 in
@@ -71,9 +71,9 @@ def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB,
} // SchedRW
def : Pat<(debugtrap),
- (INT3)>, Requires<[NotPS4]>;
+ (INT3)>, Requires<[NotPS]>;
def : Pat<(debugtrap),
- (INT (i8 0x41))>, Requires<[IsPS4]>;
+ (INT (i8 0x41))>, Requires<[IsPS]>;
//===----------------------------------------------------------------------===//
// Input/Output Instructions.
diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td
index 28563eeb4484..7671eb4676ee 100644
--- a/llvm/lib/Target/X86/X86InstrTSX.td
+++ b/llvm/lib/Target/X86/X86InstrTSX.td
@@ -51,6 +51,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
// HLE prefixes
let SchedRW = [WriteSystem] in {
+// XACQUIRE and XRELEASE reuse REPNE and REP respectively.
+// For now, just prefer the REP versions.
let isAsmParserOnly = 1 in {
def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>;
def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>;
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index 2429aa113fb1..e6ecbb652100 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -17,6 +17,8 @@
let Predicates = [NoAVX512] in {
// A vector extract of the first f32/f64 position is a subregister copy
+ def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>;
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
(COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
@@ -34,8 +36,8 @@ let Predicates = [HasAVX512] in {
}
let Predicates = [NoVLX] in {
- def : Pat<(v8f16 (scalar_to_vector FR16X:$src)),
- (COPY_TO_REGCLASS FR16X:$src, VR128)>;
+ def : Pat<(v8f16 (scalar_to_vector FR16:$src)),
+ (COPY_TO_REGCLASS FR16:$src, VR128)>;
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
(COPY_TO_REGCLASS FR32:$src, VR128)>;
diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td
index a5976b7d2d74..d89e481f4522 100644
--- a/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/llvm/lib/Target/X86/X86InstrXOP.td
@@ -13,11 +13,11 @@
multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWriteVecALU.XMM]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
- Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
+ Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
}
let ExeDomain = SSEPackedInt in {
diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 28d57ca9ae3c..ff701159b95e 100644
--- a/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -21,7 +21,6 @@
#include "X86TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -31,6 +30,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -179,6 +179,8 @@ X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
return &X86::GR64RegClass;
}
if (RB.getID() == X86::VECRRegBankID) {
+ if (Ty.getSizeInBits() == 16)
+ return STI.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
if (Ty.getSizeInBits() == 32)
return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
if (Ty.getSizeInBits() == 64)
@@ -516,7 +518,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
// is already on the instruction we're mutating, and thus we don't need to
// make any changes. So long as we select an opcode which is capable of
// loading or storing the appropriate size atomically, the rest of the
- // backend is required to respect the MMO state.
+ // backend is required to respect the MMO state.
if (!MemOp.isUnordered()) {
LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n");
return false;
@@ -537,12 +539,12 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
I.setDesc(TII.get(NewOpc));
MachineInstrBuilder MIB(MF, I);
if (Opc == TargetOpcode::G_LOAD) {
- I.RemoveOperand(1);
+ I.removeOperand(1);
addFullAddress(MIB, AM);
} else {
// G_STORE (VAL, Addr), X86Store instruction (Addr, VAL)
- I.RemoveOperand(1);
- I.RemoveOperand(0);
+ I.removeOperand(1);
+ I.removeOperand(0);
addFullAddress(MIB, AM).addUse(DefReg);
}
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -625,7 +627,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
I.setDesc(TII.get(NewOpc));
MachineInstrBuilder MIB(MF, I);
- I.RemoveOperand(1);
+ I.removeOperand(1);
addFullAddress(MIB, AM);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -1412,7 +1414,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
- MF.getDataLayout().getPointerSize(), Alignment);
+ LLT::pointer(0, MF.getDataLayout().getPointerSizeInBits()), Alignment);
LoadInst =
addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg),
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 1edec96bbec3..3c8be95b43e3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -371,8 +371,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -818,8 +818,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
+ X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1281,8 +1281,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, ISD::AVGCEILU, 0),
X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
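
// A minimal scalar sketch (not LLVM code) of the semantics behind remapping the
// pavg intrinsics from X86ISD::AVG to the generic ISD::AVGCEILU node above:
// PAVGB/PAVGW compute an unsigned average rounded up, (a + b + 1) >> 1, using a
// wider temporary so the +1 cannot overflow.
#include <cassert>
#include <cstdint>

static uint8_t avgceilu8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((static_cast<uint16_t>(A) + B + 1) >> 1);
}

int main() {
  assert(avgceilu8(1, 2) == 2);       // rounds up, not down
  assert(avgceilu8(255, 255) == 255); // no wraparound in the 16-bit temporary
  return 0;
}
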
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 4710e524931c..23976fb1a142 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -558,7 +558,7 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
}
// Find and eliminate gadget edges that have been mitigated.
- int MitigatedGadgets = 0, RemainingGadgets = 0;
+ int RemainingGadgets = 0;
NodeSet ReachableNodes{G};
for (const Node &RootN : G.nodes()) {
if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge))
@@ -586,7 +586,6 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
// This gadget's sink is reachable
++RemainingGadgets;
} else { // This gadget's sink is unreachable, and therefore mitigated
- ++MitigatedGadgets;
ElimEdges.insert(E);
}
}
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 6b564a0356a6..70964b352b8c 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 6206d8efb3d0..540182cb7911 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -74,6 +74,24 @@ static bool isAMXCast(Instruction *II) {
match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
}
+static bool isAMXIntrinsic(Value *I) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+ if (isAMXCast(II))
+ return false;
+  // Check if the return type or any parameter is x86_amx. If it is x86_amx,
+  // the intrinsic must be an x86 AMX intrinsic.
+ if (II->getType()->isX86_AMXTy())
+ return true;
+ for (Value *V : II->args()) {
+ if (V->getType()->isX86_AMXTy())
+ return true;
+ }
+
+ return false;
+}
+
static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
Type *Ty) {
Function &F = *BB->getParent();
@@ -162,6 +180,36 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
return std::make_pair(Row, Col);
}
+static std::pair<Value *, Value *> getShape(PHINode *Phi) {
+ Use &U = *(Phi->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ User *V = U.getUser();
+  // TODO: We don't traverse all users. To keep the algorithm simple, we only
+  // follow the first user. If we can find the shape, return it; otherwise
+  // return nullptr and the optimization for undef/zero is abandoned.
+ while (V) {
+ if (isAMXCast(dyn_cast<Instruction>(V))) {
+ if (V->use_empty())
+ break;
+ Use &U = *(V->use_begin());
+ OpNo = U.getOperandNo();
+ V = U.getUser();
+ } else if (isAMXIntrinsic(V)) {
+ return getShape(cast<IntrinsicInst>(V), OpNo);
+ } else if (isa<PHINode>(V)) {
+ if (V->use_empty())
+ break;
+ Use &U = *(V->use_begin());
+ V = U.getUser();
+ } else {
+ break;
+ }
+ }
+
+ return std::make_pair(nullptr, nullptr);
+}
+
namespace {
class X86LowerAMXType {
Function &Func;
@@ -655,6 +703,9 @@ class X86LowerAMXCast {
public:
X86LowerAMXCast(Function &F) : Func(F) {}
+ void combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
+ void combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
+ bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
bool combineAMXcast(TargetLibraryInfo *TLI);
bool transformAMXCast(IntrinsicInst *AMXCast);
bool transformAllAMXCast();
@@ -720,11 +771,33 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
OldPhiNodes.insert(PN);
while (!PhiWorklist.empty()) {
auto *OldPN = PhiWorklist.pop_back_val();
- for (Value *IncValue : OldPN->incoming_values()) {
+ for (unsigned I = 0; I < OldPN->getNumOperands(); ++I) {
+ Value *IncValue = OldPN->getIncomingValue(I);
// TODO: currently, We ignore cases where it is a const. In the future, we
// might support const.
- if (isa<Constant>(IncValue))
- return false;
+ if (isa<Constant>(IncValue)) {
+ auto *IncConst = dyn_cast<Constant>(IncValue);
+ if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
+ return false;
+ Value *Row = nullptr, *Col = nullptr;
+ std::tie(Row, Col) = getShape(OldPN);
+        // TODO: If it is not constant, the Row and Col must dominate the
+        // tilezero that we are going to create.
+ if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
+ return false;
+ // Create tilezero at the end of incoming block.
+ auto *Block = OldPN->getIncomingBlock(I);
+ BasicBlock::iterator Iter = Block->getTerminator()->getIterator();
+ Instruction *NewInst = Builder.CreateIntrinsic(
+ Intrinsic::x86_tilezero_internal, None, {Row, Col});
+ NewInst->moveBefore(&*Iter);
+ NewInst = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+ {IncValue->getType()}, {NewInst});
+ NewInst->moveBefore(&*Iter);
+ // Replace InValue with new Value.
+ OldPN->setIncomingValue(I, NewInst);
+ IncValue = NewInst;
+ }
if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
if (OldPhiNodes.insert(PNode))
@@ -838,6 +911,99 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
return true;
}
+// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
+// store <256 x i32> %43, <256 x i32>* %p, align 64
+// -->
+// call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p,
+// i64 64, x86_amx %42)
+void X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
+ Value *Tile = Cast->getOperand(0);
+  // TODO: If it is a cast intrinsic or a PHI node, we can propagate the
+  // shape information through the def-use chain.
+ if (!isAMXIntrinsic(Tile))
+ return;
+ auto *II = cast<IntrinsicInst>(Tile);
+  // Tile is the output of an AMX intrinsic. The first operand of the
+  // intrinsic is the row and the second operand is the column.
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ IRBuilder<> Builder(ST);
+  // Use the maximum column as the stride. It must match the stride used by
+  // the corresponding load.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+}
+
+// %65 = load <256 x i32>, <256 x i32>* %p, align 64
+// %66 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %65)
+// -->
+// %66 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+// i8* %p, i64 64)
+void X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
+ Value *Row = nullptr, *Col = nullptr;
+ Use &U = *(Cast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = cast<IntrinsicInst>(U.getUser());
+  // TODO: If it is a cast intrinsic or a PHI node, we can propagate the
+  // shape information through the def-use chain.
+ if (!isAMXIntrinsic(II))
+ return;
+ std::tie(Row, Col) = getShape(II, OpNo);
+ IRBuilder<> Builder(LD);
+  // Use the maximum column as the stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+
+ Value *NewInst =
+ Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
+ Cast->replaceAllUsesWith(NewInst);
+}
+
+bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
+ bool Change = false;
+ for (auto *Cast : Casts) {
+ auto *II = cast<IntrinsicInst>(Cast);
+ // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %42)
+ // store <256 x i32> %43, <256 x i32>* %p, align 64
+ // -->
+ // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %p,
+ // i64 64, x86_amx %42)
+ if (II->getIntrinsicID() == Intrinsic::x86_cast_tile_to_vector) {
+ SmallVector<Instruction *, 2> DeadStores;
+ for (User *U : Cast->users()) {
+ StoreInst *Store = dyn_cast<StoreInst>(U);
+ if (!Store)
+ continue;
+ combineCastStore(cast<IntrinsicInst>(Cast), Store);
+ DeadStores.push_back(Store);
+ Change = true;
+ }
+ for (auto *Store : DeadStores)
+ Store->eraseFromParent();
+ } else { // x86_cast_vector_to_tile
+ SmallVector<Instruction *, 2> DeadLoads;
+ auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
+ if (!Load || !Load->hasOneUse())
+ continue;
+ // %65 = load <256 x i32>, <256 x i32>* %p, align 64
+ // %66 = call x86_amx @llvm.x86.cast.vector.to.tile(<256 x i32> %65)
+ // -->
+ // %66 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %p, i64 64)
+ combineLoadCast(cast<IntrinsicInst>(Cast), Load);
+      // Set the operand to null so that the load instruction can be erased.
+ Cast->setOperand(0, nullptr);
+ Load->eraseFromParent();
+ }
+ }
+ return Change;
+}
+
bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) {
bool Change = false;
// Collect tile cast instruction.
@@ -879,17 +1045,22 @@ bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) {
Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector);
Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile);
+ SmallVector<Instruction *, 8> LiveCasts;
auto EraseInst = [&](SmallVectorImpl<Instruction *> &Insts) {
for (auto *Inst : Insts) {
if (Inst->use_empty()) {
Inst->eraseFromParent();
Change = true;
+ } else {
+ LiveCasts.push_back(Inst);
}
}
};
EraseInst(Vec2TileInsts);
EraseInst(Tile2VecInsts);
+ Change |= combineLdSt(LiveCasts);
+ EraseInst(LiveCasts);
// Handle the A->B->A cast, and there is an intervening PHI node.
for (BasicBlock &BB : Func) {
@@ -947,6 +1118,10 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
// i64 60)
// call void @llvm.x86.tilestored64.internal(i16 15, i16 60,
// i8* %addr3, i64 60, x86_amx %2)
+ if (AMXCast->use_empty()) {
+ AMXCast->eraseFromParent();
+ return true;
+ }
Use &U = *(AMXCast->use_begin());
unsigned OpNo = U.getOperandNo();
auto *II = dyn_cast<IntrinsicInst>(U.getUser());
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 9044f10ec630..b107de692365 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -501,7 +501,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
for (const MachineOperand &MO : MI->operands())
if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
- OutMI.addOperand(MaybeMCOp.getValue());
+ OutMI.addOperand(*MaybeMCOp);
// Handle a few special cases to eliminate operand modifiers.
switch (OutMI.getOpcode()) {
@@ -962,6 +962,12 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
// These are not truly commutable so hide them from the default case.
break;
+ case X86::MASKMOVDQU:
+ case X86::VMASKMOVDQU:
+ if (AsmPrinter.getSubtarget().is64Bit())
+ OutMI.setFlags(X86::IP_HAS_AD_SIZE);
+ break;
+
default: {
// If the instruction is a commutable arithmetic instruction we might be
// able to commute the operands to get a 2 byte VEX prefix.
@@ -1311,7 +1317,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
E = FaultingMI.operands_end();
I != E; ++I)
if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
- MI.addOperand(MaybeOperand.getValue());
+ MI.addOperand(*MaybeOperand);
OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
OutStreamer->emitInstruction(MI, getSubtargetInfo());
@@ -1347,11 +1353,12 @@ void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
AccessInfo.CompileKernel, &ShadowBase,
&MappingScale, &OrShadowOffset);
- std::string Name = AccessInfo.IsWrite ? "store" : "load";
- std::string Op = OrShadowOffset ? "or" : "add";
- std::string SymName = "__asan_check_" + Name + "_" + Op + "_" +
- utostr(1ULL << AccessInfo.AccessSizeIndex) + "_" +
- TM.getMCRegisterInfo()->getName(Reg.asMCReg());
+ StringRef Name = AccessInfo.IsWrite ? "store" : "load";
+ StringRef Op = OrShadowOffset ? "or" : "add";
+ std::string SymName = ("__asan_check_" + Name + "_" + Op + "_" +
+ Twine(1ULL << AccessInfo.AccessSizeIndex) + "_" +
+ TM.getMCRegisterInfo()->getName(Reg.asMCReg()))
+ .str();
if (OrShadowOffset)
report_fatal_error(
"OrShadowOffset is not supported with optimized callbacks");
@@ -1375,7 +1382,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
MCI.setOpcode(Opcode);
for (auto &MO : drop_begin(MI.operands(), 2))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
- MCI.addOperand(MaybeOperand.getValue());
+ MCI.addOperand(*MaybeOperand);
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
@@ -1751,7 +1758,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
Ret.setOpcode(OpCode);
for (auto &MO : drop_begin(MI.operands()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
- Ret.addOperand(MaybeOperand.getValue());
+ Ret.addOperand(*MaybeOperand);
OutStreamer->emitInstruction(Ret, getSubtargetInfo());
emitX86Nops(*OutStreamer, 10, Subtarget);
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2);
@@ -1790,7 +1797,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
OutStreamer->AddComment("TAILCALL");
for (auto &MO : drop_begin(MI.operands()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
- TC.addOperand(MaybeOperand.getValue());
+ TC.addOperand(*MaybeOperand);
OutStreamer->emitInstruction(TC, getSubtargetInfo());
}
@@ -1985,34 +1992,34 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
// Otherwise, use the .seh_ directives for all other Windows platforms.
switch (MI->getOpcode()) {
case X86::SEH_PushReg:
- OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm());
+ OutStreamer->emitWinCFIPushReg(MI->getOperand(0).getImm());
break;
case X86::SEH_SaveReg:
- OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(),
+ OutStreamer->emitWinCFISaveReg(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_SaveXMM:
- OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(),
+ OutStreamer->emitWinCFISaveXMM(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_StackAlloc:
- OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ OutStreamer->emitWinCFIAllocStack(MI->getOperand(0).getImm());
break;
case X86::SEH_SetFrame:
- OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(),
+ OutStreamer->emitWinCFISetFrame(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_PushFrame:
- OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ OutStreamer->emitWinCFIPushFrame(MI->getOperand(0).getImm());
break;
case X86::SEH_EndPrologue:
- OutStreamer->EmitWinCFIEndProlog();
+ OutStreamer->emitWinCFIEndProlog();
break;
default:
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
index 05f846bfb219..2e88e01ce7fd 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -13,6 +13,13 @@
using namespace llvm;
+MachineFunctionInfo *X86MachineFunctionInfo::clone(
+ BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+ const {
+ return DestMF.cloneInfo<X86MachineFunctionInfo>(*this);
+}
+
void X86MachineFunctionInfo::anchor() { }
void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 99d1a97380dd..99cc9f525b2c 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -119,7 +119,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
Optional<int> SwiftAsyncContextFrameIdx;
- ValueMap<const Value *, size_t> PreallocatedIds;
+ // Preallocated fields are only used during isel.
+ // FIXME: Can we find somewhere else to store these?
+ DenseMap<const Value *, size_t> PreallocatedIds;
SmallVector<size_t, 0> PreallocatedStackSizes;
SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets;
@@ -132,6 +134,12 @@ public:
X86MachineFunctionInfo() = default;
explicit X86MachineFunctionInfo(MachineFunction &MF) {}
+ explicit X86MachineFunctionInfo(const X86MachineFunctionInfo &) = default;
+
+ MachineFunctionInfo *
+ clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF,
+ const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
+ const override;
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp
index 425054cfdd92..aa6e8645e092 100644
--- a/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index e92b1b002bb0..bb59cee8badb 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -37,21 +37,20 @@ STATISTIC(NumBBsPadded, "Number of basic blocks padded");
namespace {
struct VisitedBBInfo {
// HasReturn - Whether the BB contains a return instruction
- bool HasReturn;
+ bool HasReturn = false;
// Cycles - Number of cycles until return if HasReturn is true, otherwise
// number of cycles until end of the BB
- unsigned int Cycles;
+ unsigned int Cycles = 0;
- VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+ VisitedBBInfo() = default;
VisitedBBInfo(bool HasReturn, unsigned int Cycles)
: HasReturn(HasReturn), Cycles(Cycles) {}
};
struct PadShortFunc : public MachineFunctionPass {
static char ID;
- PadShortFunc() : MachineFunctionPass(ID)
- , Threshold(4) {}
+ PadShortFunc() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -82,7 +81,7 @@ namespace {
MachineBasicBlock::iterator &MBBI,
unsigned int NOOPsToAdd);
- const unsigned int Threshold;
+ const unsigned int Threshold = 4;
// ReturnBBs - Maps basic blocks that return to the minimum number of
// cycles until the return, starting from the entry block.
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index 4342ac089cae..7761f7323358 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -19,8 +19,10 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -220,16 +222,21 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
return false;
- // Operand should be a select.
- auto *SI = dyn_cast<SelectInst>(Op);
- if (!SI)
- return false;
-
- // Select needs to implement absolute value.
- Value *LHS, *RHS;
- auto SPR = matchSelectPattern(SI, LHS, RHS);
- if (SPR.Flavor != SPF_ABS)
- return false;
+ Value *LHS;
+ if (match(Op, PatternMatch::m_Intrinsic<Intrinsic::abs>())) {
+ LHS = Op->getOperand(0);
+ } else {
+ // Operand should be a select.
+ auto *SI = dyn_cast<SelectInst>(Op);
+ if (!SI)
+ return false;
+
+ Value *RHS;
+ // Select needs to implement absolute value.
+ auto SPR = matchSelectPattern(SI, LHS, RHS);
+ if (SPR.Flavor != SPF_ABS)
+ return false;
+ }
// Need a subtract of two values.
auto *Sub = dyn_cast<BinaryOperator>(LHS);
@@ -253,7 +260,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
if (!Op0 || !Op1)
return false;
- IRBuilder<> Builder(SI);
+ IRBuilder<> Builder(Op);
auto *OpTy = cast<FixedVectorType>(Op->getType());
unsigned NumElts = OpTy->getNumElements();
@@ -271,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
IntrinsicNumElts = 16;
}
- Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
+ Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID);
if (NumElts < 16) {
// Pad input with zeroes.
@@ -336,8 +343,8 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
}
- SI->replaceAllUsesWith(Ops[0]);
- SI->eraseFromParent();
+ Op->replaceAllUsesWith(Ops[0]);
+ Op->eraseFromParent();
return true;
}
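
// A minimal scalar sketch (not LLVM code) of the reduction kernel that
// trySADReplacement now accepts in two forms: an absolute difference of
// unsigned bytes feeding a sum, which maps onto PSADBW. With this change the
// abs may come from the llvm.abs intrinsic as well as the older select idiom.
#include <cassert>
#include <cstdint>
#include <cstdlib>

static uint32_t sad(const uint8_t *A, const uint8_t *B, int N) {
  uint32_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += std::abs(int(A[I]) - int(B[I])); // abs(sub) -- the matched kernel
  return Sum;
}

int main() {
  uint8_t A[4] = {1, 10, 3, 200};
  uint8_t B[4] = {4, 2, 3, 100};
  assert(sad(A, B, 4) == 111); // 3 + 8 + 0 + 100
  return 0;
}
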
diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp
index d9c6d08ada73..cd0d448238a6 100644
--- a/llvm/lib/Target/X86/X86PreAMXConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -91,16 +91,17 @@ static bool brokenVolatile(Instruction *I) {
namespace {
class X86PreAMXConfig {
+ using PosAndShapesMap = MapVector<Instruction *, SmallVector<Value *, 8>>;
+
Function &F;
public:
X86PreAMXConfig(Function &Func) : F(Func) {}
bool preTileConfig();
- bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
- bool findConfigShapes(
- DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes);
+ void addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
+ bool findConfigShapes(PosAndShapesMap &PosAndShapes);
bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes);
- bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
+ void preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder,
SmallVector<Value *, 8> &Shapes);
BasicBlock::iterator
getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
@@ -149,10 +150,9 @@ public:
// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
// call void @llvm.x86.tilestored64.internal(... td) area
// --------------------------------------------------------------------------
-bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
+void X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder,
SmallVector<Value *, 8> &Shapes) {
- bool Write = false;
- LLVMContext &Ctx = Pos->getParent()->getContext();
+ LLVMContext &Ctx = Builder.getContext();
Type *I8Ty = Type::getInt8Ty(Ctx);
Type *I16Ty = Type::getInt16Ty(Ctx);
@@ -160,30 +160,27 @@ bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
// other value in the future.
Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
- Value *PalettePos =
- GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos);
- new StoreInst(PaletteValue, PalettePos, Pos);
+ Value *PalettePos = Builder.CreateGEP(I8Ty, I8Ptr, PaletteOffset);
+ Builder.CreateStore(PaletteValue, PalettePos);
for (int I = 0, E = Shapes.size() / 2; I < E; I++) {
Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I);
Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2);
const std::string ShapeName = "amx.tmm." + itostr(I);
- Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset,
- ShapeName + ".shape.row", Pos);
- Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos);
- ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0),
- ShapeName + ".shape.col", Pos);
+ Value *RowPos = Builder.CreateGEP(I8Ty, I8Ptr, RowOffset,
+ ShapeName + ".shape.row");
+ Value *ColPos = Builder.CreateGEP(I8Ty, I8Ptr, ColOffset);
+ ColPos = Builder.CreateBitCast(ColPos, PointerType::get(I16Ty, 0),
+ ShapeName + ".shape.col");
Value *Row = Shapes[I * 2];
Value *Col = Shapes[I * 2 + 1];
- Row = new TruncInst(Row, I8Ty, "", Pos);
- new StoreInst(Row, RowPos, Pos);
- new StoreInst(Col, ColPos, Pos);
- Write = true;
+ Row = Builder.CreateTrunc(Row, I8Ty);
+ Builder.CreateStore(Row, RowPos);
+ Builder.CreateStore(Col, ColPos);
}
- return Write;
}
-bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
+void X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
SmallVector<Value *, 8> &Shapes) {
Module *M = F.getParent();
IRBuilder<> Builder(ModelStart);
@@ -198,17 +195,11 @@ bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
Addr->setAlignment(Alignment);
Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
- std::array<Value *, 1> Args = {I8Ptr};
- Instruction *Cfg =
- Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args);
-
- Value *Val0 = Constant::getNullValue(V512Ty);
- Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg);
- assert(Init0 && "Not Zero initilizate the cfg mem!");
+ Builder.CreateAlignedStore(Constant::getNullValue(V512Ty), Addr, Alignment);
- preWriteTileCfg(I8Ptr, Cfg, Shapes);
+ preWriteTileCfg(I8Ptr, Builder, Shapes);
- return Init0;
+ Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, {I8Ptr});
}
// Todo: We may need to handle "more than one store" case in the future.
@@ -315,8 +306,7 @@ X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n)
// call void @llvm.x86.tilestored64.internal(m, n,... td) (m,n)(m,n)
// --------------------------------------------------------------------------
-bool X86PreAMXConfig::findConfigShapes(
- DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes) {
+bool X86PreAMXConfig::findConfigShapes(PosAndShapesMap &PosAndShapes) {
bool Find = false;
for (BasicBlock &BB : F) {
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
@@ -365,7 +355,7 @@ bool X86PreAMXConfig::findConfigShapes(
// call void @llvm.x86.tilestored64.internal(... td) area
// --------------------------------------------------------------------------
bool X86PreAMXConfig::preTileConfig() {
- DenseMap<Instruction *, SmallVector<Value *, 8>> PosAndShapes;
+ PosAndShapesMap PosAndShapes;
bool NeedCfg = findConfigShapes(PosAndShapes);
if (!NeedCfg)
return false;
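
// A sketch of the 64-byte configuration block that preWriteTileCfg above fills
// in, based on the documented AMX ldtilecfg memory layout rather than on this
// patch; it is where the offsets 0 (palette), 16 + 2*i (per-tile colsb) and
// 48 + i (per-tile rows) come from.
#include <cstdint>

struct TileConfig {
  uint8_t  Palette;      // byte 0, written as 1 by preWriteTileCfg
  uint8_t  StartRow;     // byte 1
  uint8_t  Reserved[14]; // bytes 2..15
  uint16_t ColsB[16];    // bytes 16..47, per-tile column width in bytes
  uint8_t  Rows[16];     // bytes 48..63, per-tile row count
};
static_assert(sizeof(TileConfig) == 64, "ldtilecfg operand is 64 bytes");
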
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 5d21f8666ec6..479db8585ca0 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -40,10 +41,15 @@
using namespace llvm;
#define DEBUG_TYPE "tile-pre-config"
-#define REPORT_CONFIG_FAIL \
- report_fatal_error( \
- MF.getName() + \
- ": Failed to config tile register, please define the shape earlier");
+
+static void emitErrorMsg(MachineFunction &MF) {
+ SmallString<32> Str;
+ Twine ErrorMsg =
+ MF.getName() +
+ ": Failed to config tile register, please define the shape earlier";
+ LLVMContext &Context = MF.getMMI().getModule()->getContext();
+ Context.emitError(ErrorMsg);
+}
namespace {
@@ -302,12 +308,19 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineBasicBlock *, 8> WorkList;
for (auto &I : ShapeBBs) {
// TODO: We can hoist shapes across BBs here.
- if (BBVisitedInfo[I.first].HasAMXRegLiveIn)
- REPORT_CONFIG_FAIL
+ if (BBVisitedInfo[I.first].HasAMXRegLiveIn) {
+      // We are not able to config the tile registers since the shape to
+      // config is not defined yet. Emit an error message and continue; the
+      // function will not config tile registers.
+ emitErrorMsg(MF);
+ return false;
+ }
if (BBVisitedInfo[I.first].FirstAMX &&
BBVisitedInfo[I.first].FirstAMX < I.second.back() &&
- !hoistShapesInBB(I.first, I.second))
- REPORT_CONFIG_FAIL
+ !hoistShapesInBB(I.first, I.second)) {
+ emitErrorMsg(MF);
+ return false;
+ }
WorkList.push_back(I.first);
}
while (!WorkList.empty()) {
@@ -356,7 +369,7 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
// multi insert.
if (VisitedOrInserted.insert(I).second) {
auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin();
- addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::LDTILECFG)),
+ addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::PLDTILECFGV)),
SS);
}
}
@@ -367,33 +380,27 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *MI = &*MBB.begin();
if (ST.hasAVX512()) {
Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
- BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm)
- .addReg(Zmm, RegState::Undef)
- .addReg(Zmm, RegState::Undef);
+ BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm);
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS)
.addReg(Zmm);
} else if (ST.hasAVX2()) {
Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
- BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm)
- .addReg(Ymm, RegState::Undef)
- .addReg(Ymm, RegState::Undef);
+ BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm);
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS)
.addReg(Ymm);
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32)
.addReg(Ymm);
} else {
assert(ST.hasSSE2() && "AMX should assume SSE2 enabled");
+ unsigned StoreOpc = ST.hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
- BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm)
- .addReg(Xmm, RegState::Undef)
- .addReg(Xmm, RegState::Undef);
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS)
- .addReg(Xmm);
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16)
+ BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS).addReg(Xmm);
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 16)
.addReg(Xmm);
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32)
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 32)
.addReg(Xmm);
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48)
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 48)
.addReg(Xmm);
}
// Fill in the palette first.
diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
index 9c076d2d6769..c49fc458eab3 100644
--- a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -12,9 +12,9 @@
#include "X86RegisterBankInfo.h"
#include "X86InstrInfo.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#define GET_TARGET_REGBANK_IMPL
@@ -25,8 +25,7 @@ using namespace llvm;
#define GET_TARGET_REGBANK_INFO_IMPL
#include "X86GenRegisterBankInfo.def"
-X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
- : X86GenRegisterBankInfo() {
+X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) {
// validate RegBank initialization.
const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID);
diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.h b/llvm/lib/Target/X86/X86RegisterBankInfo.h
index d5afd2cae761..fca36a317b58 100644
--- a/llvm/lib/Target/X86/X86RegisterBankInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterBankInfo.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
#define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
#define GET_REGBANK_DECLARATIONS
#include "X86GenRegisterBank.inc"
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 130cb61cdde2..f2658f70434b 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -26,6 +26,8 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
@@ -618,6 +620,66 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool X86RegisterInfo::isArgumentRegister(const MachineFunction &MF,
+ MCRegister Reg) const {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const TargetRegisterInfo &TRI = *ST.getRegisterInfo();
+ auto IsSubReg = [&](MCRegister RegA, MCRegister RegB) {
+ return TRI.isSuperOrSubRegisterEq(RegA, RegB);
+ };
+
+ if (!ST.is64Bit())
+ return llvm::any_of(
+ SmallVector<MCRegister>{X86::EAX, X86::ECX, X86::EDX},
+ [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); }) ||
+ (ST.hasMMX() && X86::VR64RegClass.contains(Reg));
+
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
+
+ if (CC == CallingConv::X86_64_SysV && IsSubReg(X86::RAX, Reg))
+ return true;
+
+ if (llvm::any_of(
+ SmallVector<MCRegister>{X86::RDX, X86::RCX, X86::R8, X86::R9},
+ [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); }))
+ return true;
+
+ if (CC != CallingConv::Win64 &&
+ llvm::any_of(SmallVector<MCRegister>{X86::RDI, X86::RSI},
+ [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); }))
+ return true;
+
+ if (ST.hasSSE1() &&
+ llvm::any_of(SmallVector<MCRegister>{X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5,
+ X86::XMM6, X86::XMM7},
+ [&](MCRegister &RegA) { return IsSubReg(RegA, Reg); }))
+ return true;
+
+ return X86GenRegisterInfo::isArgumentRegister(MF, Reg);
+}
+
+bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
+ MCRegister PhysReg) const {
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const TargetRegisterInfo &TRI = *ST.getRegisterInfo();
+
+ // Stack pointer.
+ if (TRI.isSuperOrSubRegisterEq(X86::RSP, PhysReg))
+ return true;
+
+ // The frame pointer is fixed whenever it is being used as the frame pointer.
+ const X86FrameLowering &TFI = *getFrameLowering(MF);
+ if (TFI.hasFP(MF) && TRI.isSuperOrSubRegisterEq(X86::RBP, PhysReg))
+ return true;
+
+ return X86GenRegisterInfo::isFixedRegister(MF, PhysReg);
+}
+
+bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
+ return RC->getID() == X86::TILERegClassID;
+}
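
The isArgumentRegister implementation above is essentially an any_of scan over a short candidate list with a sub/super-register-aware predicate. A stand-alone sketch of that shape, with std::any_of and an invented aliases() rule standing in for TRI.isSuperOrSubRegisterEq (the register encodings below are hypothetical, not real X86 values):

#include <algorithm>
#include <array>
#include <cstdint>

using Reg = std::uint16_t;

// Hypothetical stand-in for TRI.isSuperOrSubRegisterEq: pretend two registers
// alias when they share the low byte of their encoding.
static bool aliases(Reg A, Reg B) { return (A & 0xFF) == (B & 0xFF); }

// Returns true if R is (or aliases) one of the integer argument registers.
// The encodings here are invented for the sketch only.
static bool isIntegerArgReg(Reg R) {
  constexpr std::array<Reg, 6> ArgRegs = {0x107, 0x106, 0x102,
                                          0x101, 0x108, 0x109};
  return std::any_of(ArgRegs.begin(), ArgRegs.end(),
                     [&](Reg A) { return aliases(A, R); });
}

With real registers the predicate would instead delegate to TargetRegisterInfo, exactly as the lambda in the hunk above does.
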
+
void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
// Check if the EFLAGS register is marked as live-out. This shouldn't happen,
// because the calling convention defines the EFLAGS register as NOT
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 7fd10ddd1a15..6f4fb405d29f 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -115,6 +115,18 @@ public:
/// register scavenger to determine what registers are free.
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ /// isArgumentRegister - Returns true if Reg can be used as an argument to a
+ /// function.
+ bool isArgumentRegister(const MachineFunction &MF,
+ MCRegister Reg) const override;
+
+ /// Returns true if RC is the tile register class.
+ bool isTileRegisterClass(const TargetRegisterClass *RC) const;
+
+ /// Returns true if PhysReg is a fixed register.
+ bool isFixedRegister(const MachineFunction &MF,
+ MCRegister PhysReg) const override;
+
void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
bool hasBasePointer(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 1b704bcb8e08..6dc51e37d3c2 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -537,6 +537,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)> {let Size = 32;}
+
// FIXME: This sets up the floating point register files as though they are f64
// values, though they really are f80 values. This will cause us to spill
@@ -599,7 +601,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
-def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>;
+def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}
// Extended VR128 and VR256 for AVX-512 instructions
def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128],
@@ -638,3 +640,14 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
+
+//===----------------------------------------------------------------------===//
+// Register categories.
+//
+
+// The TILE and VK*PAIR registers may not be "fixed", but we don't want them
+// anyway.
+def FixedRegisters : RegisterCategory<[DEBUG_REG, CONTROL_REG, CCR, FPCCR,
+ DFCCR, TILE, VK1PAIR, VK2PAIR, VK4PAIR,
+ VK8PAIR, VK16PAIR]>;
+def GeneralPurposeRegisters : RegisterCategory<[GR64, GR32, GR16, GR8]>;
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 8e317dc22bd6..e4b95cb0807f 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -814,12 +814,26 @@ def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> {
def: InstRW<[BWWriteResGroup34], (instregex "CLD")>;
def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r(1|i)",
- "RCR(8|16|32|64)r(1|i)")>;
+def: InstRW<[BWWriteResGroup35], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def BWWriteResGroup36 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[BWWriteResGroup36], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def BWWriteResGroup36b : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[BWWriteResGroup36b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> {
let Latency = 3;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 1cd0b3379684..7b1a31d2a4df 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -1299,12 +1299,26 @@ def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
def: InstRW<[HWWriteResGroup58], (instregex "CLD")>;
def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r(1|i)",
- "RCR(8|16|32|64)r(1|i)")>;
+def: InstRW<[HWWriteResGroup59], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def HWWriteResGroup60 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[HWWriteResGroup60], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def HWWriteResGroup60b : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[HWWriteResGroup60b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
let Latency = 4;
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 9fd986e34181..b66db7e7e73a 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -923,12 +923,26 @@ def ICXWriteResGroup43 : SchedWriteRes<[ICXPort237,ICXPort0156]> {
def: InstRW<[ICXWriteResGroup43], (instrs MFENCE)>;
def ICXWriteResGroup44 : SchedWriteRes<[ICXPort06,ICXPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[ICXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)",
- "RCR(8|16|32|64)r(1|i)")>;
+def: InstRW<[ICXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def ICXWriteResGroup44b : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[ICXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def ICXWriteResGroup44c : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[ICXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
def ICXWriteResGroup45 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237]> {
let Latency = 3;
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7e619a3a8722..49858ca0a800 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -111,8 +111,17 @@ def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
def : WriteRes<WriteStoreNT, [SBPort23, SBPort4]>;
def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
def : WriteRes<WriteMove, [SBPort015]>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;
-def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 5; let NumMicroOps = 0; }
+
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
// Arithmetic.
defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
@@ -678,13 +687,27 @@ def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
}
def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>;
-def SBWriteResGroup23 : SchedWriteRes<[SBPort05]> {
+def SBWriteResGroup23 : SchedWriteRes<[SBPort05,SBPort015]> {
let Latency = 2;
let NumMicroOps = 3;
- let ResourceCycles = [3];
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup23], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def SBWriteResGroup24 : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> {
+ let Latency = 3;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,4,2];
+}
+def: InstRW<[SBWriteResGroup24], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def SBWriteResGroup24b : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,4,2];
}
-def: InstRW<[SBWriteResGroup23], (instregex "RCL(8|16|32|64)r1",
- "RCR(8|16|32|64)r1")>;
+def: InstRW<[SBWriteResGroup24b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 7;
@@ -727,8 +750,8 @@ def SBWriteResGroup76 : SchedWriteRes<[SBPort05]> {
let NumMicroOps = 8;
let ResourceCycles = [8];
}
-def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)r(i|CL)",
- "RCR(8|16|32|64)r(i|CL)")>;
+def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)rCL",
+ "RCR(8|16|32|64)rCL")>;
def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> {
let Latency = 5;
@@ -802,8 +825,7 @@ def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> {
let NumMicroOps = 1;
let ResourceCycles = [1];
}
-def: InstRW<[SBWriteResGroup48], (instrs MMX_MOVD64from64rm,
- VBROADCASTSSrm)>;
+def: InstRW<[SBWriteResGroup48], (instrs VBROADCASTSSrm)>;
def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r",
"(V?)MOV64toPQIrm",
"(V?)MOVDDUPrm",
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 0a88bac5aa66..05364e3434e4 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -836,12 +836,26 @@ def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>;
def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r(1|i)",
- "RCR(8|16|32|64)r(1|i)")>;
+def: InstRW<[SKLWriteResGroup42], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def SKLWriteResGroup42b : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[SKLWriteResGroup42b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def SKLWriteResGroup42c : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[SKLWriteResGroup42c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
let Latency = 3;
@@ -921,8 +935,7 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
- "MOVZX(16|32|64)rm(8|16)",
- "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67?
+ "MOVZX(16|32|64)rm(8|16)")>;
def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> {
let Latency = 5;
@@ -979,7 +992,8 @@ def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm,
VPBROADCASTDrm,
VPBROADCASTQrm)>;
def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm",
- "(V?)MOVSLDUPrm")>;
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVDDUPrm")>;
def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> {
let Latency = 6;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index b28a18f0dcd7..b682b51c298a 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -905,12 +905,26 @@ def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>;
def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
- let Latency = 3;
+ let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
-def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)",
- "RCR(8|16|32|64)r(1|i)")>;
+def: InstRW<[SKXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def SKXWriteResGroup44b : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[SKXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def SKXWriteResGroup44c : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,2];
+}
+def: InstRW<[SKXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
let Latency = 3;
@@ -1041,8 +1055,7 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> {
let ResourceCycles = [1];
}
def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
- "MOVZX(16|32|64)rm(8|16)",
- "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71?
+ "MOVZX(16|32|64)rm(8|16)")>;
def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> {
let Latency = 5;
@@ -1145,11 +1158,10 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm,
VPBROADCASTDrm,
- VPBROADCASTQrm,
- VMOVSHDUPrm,
- VMOVSLDUPrm,
- MOVSHDUPrm,
- MOVSLDUPrm)>;
+ VPBROADCASTQrm)>;
+def: InstRW<[SKXWriteResGroup71], (instregex "(V?)MOVSHDUPrm",
+ "(V?)MOVSLDUPrm",
+ "(V?)MOVDDUPrm")>;
def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> {
let Latency = 6;
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 4b2fa87a25b5..1e9fcf6cc8cf 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -840,8 +840,8 @@ def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JAL
let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
let NumMicroOps = 63;
}
-def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32,
- VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>;
+def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
+ VMASKMOVDQU, VMASKMOVDQU64)>;
///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 52605c031617..de4e7dd3cb90 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -377,10 +377,8 @@ defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2]>;
defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
-// FIXME: The below is closer to correct, but caused some perf regressions.
-//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
-defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WritePMULLDY, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index fe0484afd227..aada3e0bd906 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -189,15 +189,6 @@ defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
defm : ZnWriteResPair<WriteADC, [ZnALU], 1>;
defm : ZnWriteResPair<WriteIMul8, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul16, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul16Imm, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul16Reg, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul32, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul32Imm, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul32Reg, [ZnALU1, ZnMultiplier], 4>;
-//defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
-//defm : ZnWriteResPair<WriteIMul64Imm, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
-//defm : ZnWriteResPair<WriteIMul64Reg, [ZnALU1, ZnMultiplier], 4, [1,1], 2>;
defm : X86WriteRes<WriteBSWAP32, [ZnALU], 1, [4], 1>;
defm : X86WriteRes<WriteBSWAP64, [ZnALU], 1, [4], 1>;
@@ -227,12 +218,10 @@ defm : X86WriteRes<WriteBitTest, [ZnALU], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
defm : X86WriteRes<WriteBitTestRegLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
defm : X86WriteRes<WriteBitTestSet, [ZnALU], 2, [1], 2>;
-//defm : X86WriteRes<WriteBitTestSetImmLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
-//defm : X86WriteRes<WriteBitTestSetRegLd, [ZnALU,ZnAGU], 5, [1,1], 2>;
// Bit counts.
-defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>;
-defm : ZnWriteResPair<WriteBSR, [ZnALU], 3>;
+defm : ZnWriteResPair<WriteBSF, [ZnALU], 3, [12], 6, 4, 2>;
+defm : ZnWriteResPair<WriteBSR, [ZnALU], 4, [16], 6, 4, 2>;
defm : ZnWriteResPair<WriteLZCNT, [ZnALU], 2>;
defm : ZnWriteResPair<WriteTZCNT, [ZnALU], 2>;
defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>;
@@ -240,9 +229,8 @@ defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
-// BMI1 BEXTR/BLS, BMI2 BZHI
+// BMI1 BEXTR, BMI2 BZHI
defm : ZnWriteResPair<WriteBEXTR, [ZnALU], 1>;
-//defm : ZnWriteResPair<WriteBLS, [ZnALU], 2>;
defm : ZnWriteResPair<WriteBZHI, [ZnALU], 1>;
// IDIV
@@ -271,13 +259,13 @@ defm : X86WriteRes<WriteFLoadX, [ZnAGU], 8, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [ZnAGU], 8, [1], 1>;
defm : X86WriteRes<WriteFMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [ZnAGU,ZnFPU01], 8, [1,2], 2>;
+
defm : X86WriteRes<WriteFStore, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
-
defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
@@ -288,24 +276,24 @@ defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
defm : X86WriteResUnsupported<WriteFMoveZ>;
-defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFAddY, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU23], 3>;
+defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU23], 3>;
+defm : ZnWriteResFpuPair<WriteFAddY, [ZnFPU23], 3, [2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
-defm : ZnWriteResFpuPair<WriteFAdd64, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFAdd64X, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFAdd64Y, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64, [ZnFPU23], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64X, [ZnFPU23], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd64Y, [ZnFPU23], 3, [2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
-defm : ZnWriteResFpuPair<WriteFCmp, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFCmpX, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFCmpY, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFCmpX, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFCmpY, [ZnFPU01], 1, [2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
-defm : ZnWriteResFpuPair<WriteFCmp64, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCmp64, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU01], 1, [2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
-defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>;
-defm : ZnWriteResFpuPair<WriteFComX, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU01,ZnFPU2], 3, [1,1], 2>;
+defm : ZnWriteResFpuPair<WriteFComX, [ZnFPU01,ZnFPU2], 3, [1,1], 2>;
defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
@@ -346,8 +334,8 @@ defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : ZnWriteResFpuPair<WriteFLogic, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteFLogicY, [ZnFPU], 1>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
-defm : ZnWriteResFpuPair<WriteFTest, [ZnFPU], 1>;
-defm : ZnWriteResFpuPair<WriteFTestY, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteFTest, [ZnFPU12], 2, [2], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteFTestY, [ZnFPU12], 4, [4], 3, 7, 2>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>;
defm : ZnWriteResFpuPair<WriteFShuffleY, [ZnFPU12], 1>;
@@ -410,20 +398,23 @@ defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>;
defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>;
-defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU2], 1>;
defm : ZnWriteResFpuPair<WriteVecShiftX, [ZnFPU2], 1>;
-defm : ZnWriteResFpuPair<WriteVecShiftY, [ZnFPU2], 2>;
+defm : ZnWriteResFpuPair<WriteVecShiftY, [ZnFPU2], 1, [2], 2>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
-defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU], 1>;
-defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU], 1>;
-defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU2], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU2], 1>;
+defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU2], 1, [2], 2>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU1], 3, [2], 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShiftY, [ZnFPU1], 3, [4], 2>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecLogicX, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecLogicY, [ZnFPU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
-defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 1, [2], 1, 7, 1>;
-defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 1, [2], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 2, [2], 1, 7, 1>;
+defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 4, [4], 3, 7, 2>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecALUX, [ZnFPU], 1>;
@@ -448,7 +439,7 @@ defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
-defm : ZnWriteResFpuPair<WriteVPMOV256, [ZnFPU12], 1, [1], 2>;
+defm : ZnWriteResFpuPair<WriteVPMOV256, [ZnFPU12], 1, [4], 3>;
defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>;
defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>;
@@ -456,11 +447,6 @@ defm : ZnWriteResFpuPair<WritePSADBWY, [ZnFPU0], 3>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : ZnWriteResFpuPair<WritePHMINPOS, [ZnFPU0], 4>;
-// Vector Shift Operations
-defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>;
-defm : ZnWriteResFpuPair<WriteVarVecShiftY, [ZnFPU12], 1>;
-defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
-
// Vector insert/extract operations.
defm : ZnWriteResFpuPair<WriteVecInsert, [ZnFPU], 1>;
@@ -623,15 +609,14 @@ def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
def : SchedAlias<WriteIMul16, ZnWriteMul16>;
def : SchedAlias<WriteIMul16Imm, ZnWriteMul16>; // TODO: is this right?
def : SchedAlias<WriteIMul16Reg, ZnWriteMul16>; // TODO: is this right?
-def : SchedAlias<WriteIMul16ImmLd, ZnWriteMul16>; // TODO: this is definitely wrong but matches what the instregex did.
-def : SchedAlias<WriteIMul16RegLd, ZnWriteMul16>; // TODO: this is definitely wrong but matches what the instregex did.
// m16.
def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
def : SchedAlias<WriteIMul16Ld, ZnWriteMul16Ld>;
-
+def : SchedAlias<WriteIMul16ImmLd, ZnWriteMul16>; // TODO: this is definitely wrong but matches what the instregex did.
+def : SchedAlias<WriteIMul16RegLd, ZnWriteMul16>; // TODO: this is definitely wrong but matches what the instregex did.
// r32.
def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
let Latency = 3;
@@ -639,14 +624,14 @@ def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
def : SchedAlias<WriteIMul32, ZnWriteMul32>;
def : SchedAlias<WriteIMul32Imm, ZnWriteMul32>; // TODO: is this right?
def : SchedAlias<WriteIMul32Reg, ZnWriteMul32>; // TODO: is this right?
-def : SchedAlias<WriteIMul32ImmLd, ZnWriteMul32>; // TODO: this is definitely wrong but matches what the instregex did.
-def : SchedAlias<WriteIMul32RegLd, ZnWriteMul32>; // TODO: this is definitely wrong but matches what the instregex did.
// m32.
def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
def : SchedAlias<WriteIMul32Ld, ZnWriteMul32Ld>;
+def : SchedAlias<WriteIMul32ImmLd, ZnWriteMul32>; // TODO: this is definitely wrong but matches what the instregex did.
+def : SchedAlias<WriteIMul32RegLd, ZnWriteMul32>; // TODO: this is definitely wrong but matches what the instregex did.
// r64.
def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
@@ -656,8 +641,6 @@ def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
def : SchedAlias<WriteIMul64, ZnWriteMul64>;
def : SchedAlias<WriteIMul64Imm, ZnWriteMul64>; // TODO: is this right?
def : SchedAlias<WriteIMul64Reg, ZnWriteMul64>; // TODO: is this right?
-def : SchedAlias<WriteIMul64ImmLd, ZnWriteMul64>; // TODO: this is definitely wrong but matches what the instregex did.
-def : SchedAlias<WriteIMul64RegLd, ZnWriteMul64>; // TODO: this is definitely wrong but matches what the instregex did.
// m64.
def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
@@ -665,6 +648,8 @@ def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
let NumMicroOps = 2;
}
def : SchedAlias<WriteIMul64Ld, ZnWriteMul64Ld>;
+def : SchedAlias<WriteIMul64ImmLd, ZnWriteMul64>; // TODO: this is definitely wrong but matches what the instregex did.
+def : SchedAlias<WriteIMul64RegLd, ZnWriteMul64>; // TODO: this is definitely wrong but matches what the instregex did.
// MULX
// Numbers are based on the AMD SOG for Family 17h - Instruction Latencies.
@@ -1101,12 +1086,11 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
// HADD, HSUB PS/PD
// PHADD|PHSUB (S) W/D.
-def : SchedAlias<WritePHAdd, ZnWriteMicrocoded>;
-def : SchedAlias<WritePHAddLd, ZnWriteMicrocoded>;
-def : SchedAlias<WritePHAddX, ZnWriteMicrocoded>;
-def : SchedAlias<WritePHAddXLd, ZnWriteMicrocoded>;
-def : SchedAlias<WritePHAddY, ZnWriteMicrocoded>;
-def : SchedAlias<WritePHAddYLd, ZnWriteMicrocoded>;
+defm : ZnWriteResFpuPair<WriteFHAdd, [], 7>;
+defm : ZnWriteResFpuPair<WriteFHAddY, [], 7>;
+defm : ZnWriteResFpuPair<WritePHAdd, [], 3>;
+defm : ZnWriteResFpuPair<WritePHAddX, [], 3>;
+defm : ZnWriteResFpuPair<WritePHAddY, [], 3>;
// PCMPGTQ.
def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>;
@@ -1446,12 +1430,6 @@ def : InstRW<[ZnWriteSHA256RNDS2Ld], (instrs SHA256RNDS2rm)>;
//-- Arithmetic instructions --//
-// HADD, HSUB PS/PD
-def : SchedAlias<WriteFHAdd, ZnWriteMicrocoded>;
-def : SchedAlias<WriteFHAddLd, ZnWriteMicrocoded>;
-def : SchedAlias<WriteFHAddY, ZnWriteMicrocoded>;
-def : SchedAlias<WriteFHAddYLd, ZnWriteMicrocoded>;
-
// VDIVPS.
// TODO - convert to ZnWriteResFpuPair
// y,y,y.
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 38908a987595..c47d235eab9b 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -195,7 +195,7 @@ defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 3, [1], 1>;
defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>;
defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>;
-defm : Zn2WriteResPair<WriteShift, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteShift, [Zn2ALU], 1>;
defm : Zn2WriteResPair<WriteShiftCL, [Zn2ALU], 1>;
defm : Zn2WriteResPair<WriteRotate, [Zn2ALU], 1>;
defm : Zn2WriteResPair<WriteRotateCL, [Zn2ALU], 1>;
@@ -219,8 +219,8 @@ defm : X86WriteRes<WriteBitTestRegLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>;
defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>;
// Bit counts.
-defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>;
-defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4>;
+defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3, [12], 6, 4, 2>;
+defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4, [16], 6, 4, 2>;
defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>;
defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>;
defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>;
@@ -230,7 +230,7 @@ def : InstRW<[WriteMove], (instrs COPY)>;
// BMI1 BEXTR, BMI2 BZHI
defm : Zn2WriteResPair<WriteBEXTR, [Zn2ALU], 1>;
-defm : Zn2WriteResPair<WriteBZHI, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteBZHI, [Zn2ALU], 1>;
// IDIV
defm : Zn2WriteResPair<WriteDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>;
@@ -247,23 +247,17 @@ def Zn2WriteIMulH : WriteRes<WriteIMulH, [Zn2Multiplier]>{
let Latency = 3;
let NumMicroOps = 0;
}
-
def : WriteRes<WriteIMulHLd, [Zn2Multiplier]>{
let Latency = !add(Zn2WriteIMulH.Latency, Znver2Model.LoadLatency);
let NumMicroOps = Zn2WriteIMulH.NumMicroOps;
}
-
// Floating point operations
defm : X86WriteRes<WriteFLoad, [Zn2AGU], 8, [1], 1>;
defm : X86WriteRes<WriteFLoadX, [Zn2AGU], 8, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [Zn2AGU], 8, [1], 1>;
defm : X86WriteRes<WriteFMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteFMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
-defm : X86WriteRes<WriteFMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteFMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteFStore, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreX, [Zn2AGU], 1, [1], 1>;
@@ -271,29 +265,34 @@ defm : X86WriteRes<WriteFStoreY, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNT, [Zn2AGU,Zn2FPU2], 8, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+
defm : X86WriteRes<WriteFMove, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [Zn2FPU], 1, [1], 1>;
defm : X86WriteResUnsupported<WriteFMoveZ>;
-defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFAddY, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU23], 3>;
+defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU23], 3>;
+defm : Zn2WriteResFpuPair<WriteFAddY, [Zn2FPU23], 3>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
-defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU23], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU23], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU23], 3>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
-defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 1>;
-defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 1>;
-defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
-defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 1>;
-defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 1>;
-defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
-defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFComX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU01,Zn2FPU2], 3, [1,1], 2>;
+defm : Zn2WriteResFpuPair<WriteFComX, [Zn2FPU01,Zn2FPU2], 3, [1,1], 2>;
defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>;
defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
@@ -332,8 +331,8 @@ defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
-defm : Zn2WriteResFpuPair<WriteFTest, [Zn2FPU], 1>;
-defm : Zn2WriteResFpuPair<WriteFTestY, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteFTest, [Zn2FPU12], 3, [2], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFTestY, [Zn2FPU12], 3, [2], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>;
defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>;
@@ -394,20 +393,23 @@ defm : X86WriteRes<WriteVecMoveToGpr, [Zn2FPU2], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [Zn2FPU2], 3, [1], 1>;
defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>;
-defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU2], 1>;
defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>;
defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 1>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
-defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>;
-defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>;
-defm : Zn2WriteResFpuPair<WriteVecShiftImmY, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU2], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU2], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImmY, [Zn2FPU2], 1>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU1], 3, [2], 1>;
+defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU1], 3, [2], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : Zn2WriteResFpuPair<WriteVecLogic, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecLogicX, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecLogicY, [Zn2FPU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
-defm : Zn2WriteResFpuPair<WriteVecTest, [Zn2FPU12], 1, [2], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteVecTestY, [Zn2FPU12], 1, [2], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteVecTest, [Zn2FPU12], 3, [2], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteVecTestY, [Zn2FPU12], 3, [2], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : Zn2WriteResFpuPair<WriteVecALU, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecALUX, [Zn2FPU], 1>;
@@ -440,11 +442,6 @@ defm : Zn2WriteResFpuPair<WritePSADBWY, [Zn2FPU0], 3>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>;
-// Vector Shift Operations
-defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 3>;
-defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 3>;
-defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
-
// Vector insert/extract operations.
defm : Zn2WriteResFpuPair<WriteVecInsert, [Zn2FPU], 1>;
@@ -486,12 +483,6 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
def Zn2WriteMicrocoded : SchedWriteRes<[]> {
let Latency = 100;
}
-defm : Zn2WriteResPair<WriteDPPS, [], 15>;
-defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
-defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
-defm : Zn2WriteResPair<WritePHAdd, [], 3>;
-defm : Zn2WriteResPair<WritePHAddX, [], 3>;
-defm : Zn2WriteResPair<WritePHAddY, [], 3>;
def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
@@ -1109,6 +1100,14 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
//-- Arithmetic instructions --//
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+defm : Zn2WriteResFpuPair<WriteFHAdd, [], 7>;
+defm : Zn2WriteResFpuPair<WriteFHAddY, [], 7>;
+defm : Zn2WriteResFpuPair<WritePHAdd, [], 3>;
+defm : Zn2WriteResFpuPair<WritePHAddX, [], 3>;
+defm : Zn2WriteResFpuPair<WritePHAddY, [], 3>;
+
// PCMPGTQ.
def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1479,6 +1478,7 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
// DPPS.
// x,x,i / v,v,v,i.
+defm : Zn2WriteResPair<WriteDPPS, [], 15>;
def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>;
// x,m,i / v,v,m,i.
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 5e59081c63b0..78a286ae5b28 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
- SDValue Size, Align Alignment, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
const X86Subtarget &Subtarget =
@@ -67,40 +67,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
if (Alignment < Align(4) || !ConstantSize ||
- ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
- // Check to see if there is a specialized entry-point for memory zeroing.
- ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
-
- if (const char *bzeroName =
- (ValC && ValC->isZero())
- ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
- : nullptr) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
- Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
- Entry.Node = Dst;
- Entry.Ty = IntPtrTy;
- Args.push_back(Entry);
- Entry.Node = Size;
- Args.push_back(Entry);
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(Chain)
- .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroName, IntPtr),
- std::move(Args))
- .setDiscardResult();
-
- std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
- return CallResult.second;
- }
-
- // Otherwise have the target-independent code call memset.
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
- }
uint64_t SizeVal = ConstantSize->getZExtValue();
SDValue InFlag;
@@ -175,7 +143,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
DAG.getConstant(Offset, dl, AddrVT)),
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
- isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+ isVolatile, AlwaysInline,
+ /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
}
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
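
A stand-alone sketch of the policy the comment above describes, with a hypothetical helper name and threshold (the real threshold comes from Subtarget.getMaxInlineSizeThreshold()): inline expansion is only attempted for small, constant-sized, at-least-4-byte-aligned memsets; everything else returns an empty SDValue so the target-independent code emits the library call instead.

#include <cstdint>
#include <optional>

// Hypothetical policy check: expand memset inline only when the size is a
// known constant below the threshold and the destination is 4-byte aligned.
static bool shouldExpandMemsetInline(std::optional<std::uint64_t> ConstSize,
                                     std::uint64_t AlignBytes,
                                     std::uint64_t MaxInlineSize = 256) {
  return AlignBytes >= 4 && ConstSize && *ConstSize <= MaxInlineSize;
}
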
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
index dac62973636c..19136ca4f6f5 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -29,7 +29,7 @@ public:
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment,
- bool isVolatile,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index dba11e8b4000..3317db891cf0 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -181,17 +181,18 @@ private:
void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
unsigned saveEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
+ MachineBasicBlock::iterator InsertPt,
+ const DebugLoc &Loc);
void restoreEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+ MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc,
Register Reg);
void mergePredStateIntoSP(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- unsigned PredStateReg);
+ MachineBasicBlock::iterator InsertPt,
+ const DebugLoc &Loc, unsigned PredStateReg);
unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
- DebugLoc Loc);
+ const DebugLoc &Loc);
void
hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
@@ -203,7 +204,7 @@ private:
bool canHardenRegister(Register Reg);
unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
- DebugLoc Loc);
+ const DebugLoc &Loc);
unsigned hardenPostLoad(MachineInstr &MI);
void hardenReturnInstr(MachineInstr &MI);
void tracePredStateThroughCall(MachineInstr &MI);
@@ -356,8 +357,8 @@ static void canonicalizePHIOperands(MachineFunction &MF) {
int OpIdx = DupIndices.pop_back_val();
// Remove both the block and value operand, again in reverse order to
// preserve indices.
- MI.RemoveOperand(OpIdx + 1);
- MI.RemoveOperand(OpIdx);
+ MI.removeOperand(OpIdx + 1);
+ MI.removeOperand(OpIdx);
}
Preds.clear();
@@ -1500,7 +1501,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
/// as the save so that no PHI nodes are inserted.
unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
- DebugLoc Loc) {
+ const DebugLoc &Loc) {
// FIXME: Hard coding this to a 32-bit register class seems weird, but matches
// what instruction selection does.
Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
@@ -1517,8 +1518,8 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
/// This must be done within the same basic block as the save in order to
/// reliably lower.
void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- Register Reg) {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ const DebugLoc &Loc, Register Reg) {
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
++NumInstsInserted;
}
@@ -1528,8 +1529,8 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
/// a way that won't form non-canonical pointers and also will be preserved
/// across normal stack adjustments.
void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- unsigned PredStateReg) {
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ const DebugLoc &Loc, unsigned PredStateReg) {
Register TmpReg = MRI->createVirtualRegister(PS->RC);
// FIXME: This hard codes a shift distance based on the number of bits needed
// to stay canonical on 64-bit. We should compute this somehow and support
@@ -1549,7 +1550,7 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
/// Extracts the predicate state stored in the high bits of the stack pointer.
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
- DebugLoc Loc) {
+ const DebugLoc &Loc) {
Register PredStateReg = MRI->createVirtualRegister(PS->RC);
Register TmpReg = MRI->createVirtualRegister(PS->RC);
@@ -1907,7 +1908,7 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
/// register class as `Reg`.
unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
- DebugLoc Loc) {
+ const DebugLoc &Loc) {
assert(canHardenRegister(Reg) && "Cannot harden this register!");
assert(Reg.isVirtual() && "Cannot harden a physical register!");
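
The signature changes in this file follow a single pattern: DebugLoc parameters become const DebugLoc &, avoiding a copy of the location (and its tracking handle) at every call; BuildMI already takes the location by const reference. A minimal sketch of that convention (restoreFlags, SavedReg and FlagsPhysReg are invented names; the body mirrors restoreEFLAGS above):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DebugLoc.h"

using namespace llvm;

// Copy a previously saved flags value back into the physical flags register.
// DebugLoc is taken by const reference; no copy is made on the way to BuildMI.
static void restoreFlags(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator InsertPt,
                         const DebugLoc &Loc, Register SavedReg,
                         const TargetInstrInfo *TII, Register FlagsPhysReg) {
  BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), FlagsPhysReg)
      .addReg(SavedReg);
}
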
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index a3d4d04b1e0d..0d091adc8e77 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -21,6 +21,8 @@
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
@@ -247,7 +249,7 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const {
// FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
// but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
// the following check for Win32 should be removed.
- if (In64BitMode || isTargetWin32())
+ if (Is64Bit || isTargetWin32())
return false;
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
@@ -274,12 +276,12 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
// introduced with Intel's Nehalem/Silvermont and AMD's Family10h
// micro-architectures respectively.
if (hasSSE42() || hasSSE4A())
- IsUAMem16Slow = false;
+ IsUnalignedMem16Slow = false;
LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
<< ", 3DNowLevel " << X863DNowLevel << ", 64bit "
<< HasX86_64 << "\n");
- if (In64BitMode && !HasX86_64)
+ if (Is64Bit && !HasX86_64)
report_fatal_error("64-bit code requested on a subtarget that doesn't "
"support it!");
@@ -289,7 +291,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
if (StackAlignOverride)
stackAlignment = *StackAlignOverride;
else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
- isTargetNaCl() || In64BitMode)
+ isTargetNaCl() || Is64Bit)
stackAlignment = Align(16);
// Consume the vector width attribute or apply any target specific limit.
@@ -357,7 +359,7 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
}
bool X86Subtarget::enableEarlyIfConversion() const {
- return hasCMov() && X86EarlyIfConv;
+ return canUseCMOV() && X86EarlyIfConv;
}
void X86Subtarget::getPostRAMutations(
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 5d773f0c57df..09a8b1f1aafb 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -50,24 +50,14 @@ enum class Style {
} // end namespace PICStyles
class X86Subtarget final : public X86GenSubtargetInfo {
- // NOTE: Do not add anything new to this list. Coarse, CPU name based flags
- // are not a good idea. We should be migrating away from these.
- enum X86ProcFamilyEnum {
- Others,
- IntelAtom
- };
-
enum X86SSEEnum {
- NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+ NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
};
enum X863DNowEnum {
NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
};
- /// X86 processor family: Intel Atom, and others
- X86ProcFamilyEnum X86ProcFamily = Others;
-
/// Which PIC style to use
PICStyles::Style PICStyle;
@@ -79,412 +69,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// MMX, 3DNow, 3DNow Athlon, or none supported.
X863DNowEnum X863DNowLevel = NoThreeDNow;
- /// True if the processor supports X87 instructions.
- bool HasX87 = false;
-
- /// True if the processor supports CMPXCHG8B.
- bool HasCmpxchg8b = false;
-
- /// True if this processor has NOPL instruction
- /// (generally pentium pro+).
- bool HasNOPL = false;
-
- /// True if this processor has conditional move instructions
- /// (generally pentium pro+).
- bool HasCMov = false;
-
- /// True if the processor supports X86-64 instructions.
- bool HasX86_64 = false;
-
- /// True if the processor supports POPCNT.
- bool HasPOPCNT = false;
-
- /// True if the processor supports SSE4A instructions.
- bool HasSSE4A = false;
-
- /// Target has AES instructions
- bool HasAES = false;
- bool HasVAES = false;
-
- /// Target has FXSAVE/FXRESTOR instructions
- bool HasFXSR = false;
-
- /// Target has XSAVE instructions
- bool HasXSAVE = false;
-
- /// Target has XSAVEOPT instructions
- bool HasXSAVEOPT = false;
-
- /// Target has XSAVEC instructions
- bool HasXSAVEC = false;
-
- /// Target has XSAVES instructions
- bool HasXSAVES = false;
-
- /// Target has carry-less multiplication
- bool HasPCLMUL = false;
- bool HasVPCLMULQDQ = false;
-
- /// Target has Galois Field Arithmetic instructions
- bool HasGFNI = false;
-
- /// Target has 3-operand fused multiply-add
- bool HasFMA = false;
-
- /// Target has 4-operand fused multiply-add
- bool HasFMA4 = false;
-
- /// Target has XOP instructions
- bool HasXOP = false;
-
- /// Target has TBM instructions.
- bool HasTBM = false;
-
- /// Target has LWP instructions
- bool HasLWP = false;
-
- /// True if the processor has the MOVBE instruction.
- bool HasMOVBE = false;
-
- /// True if the processor has the RDRAND instruction.
- bool HasRDRAND = false;
-
- /// Processor has 16-bit floating point conversion instructions.
- bool HasF16C = false;
-
- /// Processor has FS/GS base instructions.
- bool HasFSGSBase = false;
-
- /// Processor has LZCNT instruction.
- bool HasLZCNT = false;
-
- /// Processor has BMI1 instructions.
- bool HasBMI = false;
-
- /// Processor has BMI2 instructions.
- bool HasBMI2 = false;
-
- /// Processor has VBMI instructions.
- bool HasVBMI = false;
-
- /// Processor has VBMI2 instructions.
- bool HasVBMI2 = false;
-
- /// Processor has Integer Fused Multiply Add
- bool HasIFMA = false;
-
- /// Processor has RTM instructions.
- bool HasRTM = false;
-
- /// Processor has ADX instructions.
- bool HasADX = false;
-
- /// Processor has SHA instructions.
- bool HasSHA = false;
-
- /// Processor has PRFCHW instructions.
- bool HasPRFCHW = false;
-
- /// Processor has RDSEED instructions.
- bool HasRDSEED = false;
-
- /// Processor has LAHF/SAHF instructions in 64-bit mode.
- bool HasLAHFSAHF64 = false;
-
- /// Processor has MONITORX/MWAITX instructions.
- bool HasMWAITX = false;
-
- /// Processor has Cache Line Zero instruction
- bool HasCLZERO = false;
-
- /// Processor has Cache Line Demote instruction
- bool HasCLDEMOTE = false;
-
- /// Processor has MOVDIRI instruction (direct store integer).
- bool HasMOVDIRI = false;
-
- /// Processor has MOVDIR64B instruction (direct store 64 bytes).
- bool HasMOVDIR64B = false;
-
- /// Processor has ptwrite instruction.
- bool HasPTWRITE = false;
-
- /// Processor has Prefetch with intent to Write instruction
- bool HasPREFETCHWT1 = false;
-
- /// True if SHLD instructions are slow.
- bool IsSHLDSlow = false;
-
- /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
- // PMULUDQ.
- bool IsPMULLDSlow = false;
-
- /// True if the PMADDWD instruction is slow compared to PMULLD.
- bool IsPMADDWDSlow = false;
-
- /// True if unaligned memory accesses of 16-bytes are slow.
- bool IsUAMem16Slow = false;
-
- /// True if unaligned memory accesses of 32-bytes are slow.
- bool IsUAMem32Slow = false;
-
- /// True if SSE operations can have unaligned memory operands.
- /// This may require setting a configuration bit in the processor.
- bool HasSSEUnalignedMem = false;
-
- /// True if this processor has the CMPXCHG16B instruction;
- /// this is true for most x86-64 chips, but not the first AMD chips.
- bool HasCmpxchg16b = false;
-
- /// True if the LEA instruction should be used for adjusting
- /// the stack pointer. This is an optimization for Intel Atom processors.
- bool UseLeaForSP = false;
-
- /// True if POPCNT instruction has a false dependency on the destination register.
- bool HasPOPCNTFalseDeps = false;
-
- /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
- bool HasLZCNTFalseDeps = false;
-
- /// True if its preferable to combine to a single cross-lane shuffle
- /// using a variable mask over multiple fixed shuffles.
- bool HasFastVariableCrossLaneShuffle = false;
-
- /// True if its preferable to combine to a single per-lane shuffle
- /// using a variable mask over multiple fixed shuffles.
- bool HasFastVariablePerLaneShuffle = false;
-
- /// True if vzeroupper instructions should be inserted after code that uses
- /// ymm or zmm registers.
- bool InsertVZEROUPPER = false;
-
- /// True if there is no performance penalty for writing NOPs with up to
- /// 7 bytes.
- bool HasFast7ByteNOP = false;
-
- /// True if there is no performance penalty for writing NOPs with up to
- /// 11 bytes.
- bool HasFast11ByteNOP = false;
-
- /// True if there is no performance penalty for writing NOPs with up to
- /// 15 bytes.
- bool HasFast15ByteNOP = false;
-
- /// True if gather is reasonably fast. This is true for Skylake client and
- /// all AVX-512 CPUs.
- bool HasFastGather = false;
-
- /// True if hardware SQRTSS instruction is at least as fast (latency) as
- /// RSQRTSS followed by a Newton-Raphson iteration.
- bool HasFastScalarFSQRT = false;
-
- /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
- /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
- bool HasFastVectorFSQRT = false;
-
- /// True if 8-bit divisions are significantly faster than
- /// 32-bit divisions and should be used when possible.
- bool HasSlowDivide32 = false;
-
- /// True if 32-bit divides are significantly faster than
- /// 64-bit divisions and should be used when possible.
- bool HasSlowDivide64 = false;
-
- /// True if LZCNT instruction is fast.
- bool HasFastLZCNT = false;
-
- /// True if SHLD based rotate is fast.
- bool HasFastSHLDRotate = false;
-
- /// True if the processor supports macrofusion.
- bool HasMacroFusion = false;
-
- /// True if the processor supports branch fusion.
- bool HasBranchFusion = false;
-
- /// True if the processor has enhanced REP MOVSB/STOSB.
- bool HasERMSB = false;
-
- /// True if the processor has fast short REP MOV.
- bool HasFSRM = false;
-
- /// True if the short functions should be padded to prevent
- /// a stall when returning too early.
- bool PadShortFunctions = false;
-
- /// True if two memory operand instructions should use a temporary register
- /// instead.
- bool SlowTwoMemOps = false;
-
- /// True if the LEA instruction inputs have to be ready at address generation
- /// (AG) time.
- bool LEAUsesAG = false;
-
- /// True if the LEA instruction with certain arguments is slow
- bool SlowLEA = false;
-
- /// True if the LEA instruction has all three source operands: base, index,
- /// and offset or if the LEA instruction uses base and index registers where
- /// the base is EBP, RBP,or R13
- bool Slow3OpsLEA = false;
-
- /// True if INC and DEC instructions are slow when writing to flags
- bool SlowIncDec = false;
-
- /// Processor has AVX-512 PreFetch Instructions
- bool HasPFI = false;
-
- /// Processor has AVX-512 Exponential and Reciprocal Instructions
- bool HasERI = false;
-
- /// Processor has AVX-512 Conflict Detection Instructions
- bool HasCDI = false;
-
- /// Processor has AVX-512 population count Instructions
- bool HasVPOPCNTDQ = false;
-
- /// Processor has AVX-512 Doubleword and Quadword instructions
- bool HasDQI = false;
-
- /// Processor has AVX-512 Byte and Word instructions
- bool HasBWI = false;
-
- /// Processor has AVX-512 Vector Length eXtenstions
- bool HasVLX = false;
-
- /// Processor has AVX-512 16 bit floating-point extenstions
- bool HasFP16 = false;
-
- /// Processor has PKU extenstions
- bool HasPKU = false;
-
- /// Processor has AVX-512 Vector Neural Network Instructions
- bool HasVNNI = false;
-
- /// Processor has AVX Vector Neural Network Instructions
- bool HasAVXVNNI = false;
-
- /// Processor has AVX-512 bfloat16 floating-point extensions
- bool HasBF16 = false;
-
- /// Processor supports ENQCMD instructions
- bool HasENQCMD = false;
-
- /// Processor has AVX-512 Bit Algorithms instructions
- bool HasBITALG = false;
-
- /// Processor has AVX-512 vp2intersect instructions
- bool HasVP2INTERSECT = false;
-
- /// Processor supports CET SHSTK - Control-Flow Enforcement Technology
- /// using Shadow Stack
- bool HasSHSTK = false;
-
- /// Processor supports Invalidate Process-Context Identifier
- bool HasINVPCID = false;
-
- /// Processor has Software Guard Extensions
- bool HasSGX = false;
-
- /// Processor supports Flush Cache Line instruction
- bool HasCLFLUSHOPT = false;
-
- /// Processor supports Cache Line Write Back instruction
- bool HasCLWB = false;
-
- /// Processor supports Write Back No Invalidate instruction
- bool HasWBNOINVD = false;
-
- /// Processor support RDPID instruction
- bool HasRDPID = false;
-
- /// Processor supports WaitPKG instructions
- bool HasWAITPKG = false;
-
- /// Processor supports PCONFIG instruction
- bool HasPCONFIG = false;
-
- /// Processor support key locker instructions
- bool HasKL = false;
-
- /// Processor support key locker wide instructions
- bool HasWIDEKL = false;
-
- /// Processor supports HRESET instruction
- bool HasHRESET = false;
-
- /// Processor supports SERIALIZE instruction
- bool HasSERIALIZE = false;
-
- /// Processor supports TSXLDTRK instruction
- bool HasTSXLDTRK = false;
-
- /// Processor has AMX support
- bool HasAMXTILE = false;
- bool HasAMXBF16 = false;
- bool HasAMXINT8 = false;
-
- /// Processor supports User Level Interrupt instructions
- bool HasUINTR = false;
-
- /// Enable SSE4.2 CRC32 instruction (Used when SSE4.2 is supported but
- /// function is GPR only)
- bool HasCRC32 = false;
-
- /// Processor has a single uop BEXTR implementation.
- bool HasFastBEXTR = false;
-
- /// Try harder to combine to horizontal vector ops if they are fast.
- bool HasFastHorizontalOps = false;
-
- /// Prefer a left/right scalar logical shifts pair over a shift+and pair.
- bool HasFastScalarShiftMasks = false;
-
- /// Prefer a left/right vector logical shifts pair over a shift+and pair.
- bool HasFastVectorShiftMasks = false;
-
- /// Prefer a movbe over a single-use load + bswap / single-use bswap + store.
- bool HasFastMOVBE = false;
-
- /// Use a retpoline thunk rather than indirect calls to block speculative
- /// execution.
- bool UseRetpolineIndirectCalls = false;
-
- /// Use a retpoline thunk or remove any indirect branch to block speculative
- /// execution.
- bool UseRetpolineIndirectBranches = false;
-
- /// Deprecated flag, query `UseRetpolineIndirectCalls` and
- /// `UseRetpolineIndirectBranches` instead.
- bool DeprecatedUseRetpoline = false;
-
- /// When using a retpoline thunk, call an externally provided thunk rather
- /// than emitting one inside the compiler.
- bool UseRetpolineExternalThunk = false;
-
- /// Prevent generation of indirect call/branch instructions from memory,
- /// and force all indirect call/branch instructions from a register to be
- /// preceded by an LFENCE. Also decompose RET instructions into a
- /// POP+LFENCE+JMP sequence.
- bool UseLVIControlFlowIntegrity = false;
-
- /// Enable Speculative Execution Side Effect Suppression
- bool UseSpeculativeExecutionSideEffectSuppression = false;
-
- /// Insert LFENCE instructions to prevent data speculatively injected into
- /// loads from being used maliciously.
- bool UseLVILoadHardening = false;
-
- /// Use an instruction sequence for taking the address of a global that allows
- /// a memory tag in the upper address bits.
- bool AllowTaggedGlobals = false;
-
- /// Use software floating point for code generation.
- bool UseSoftFloat = false;
-
- /// Use alias analysis during code generation.
- bool UseAA = false;
-
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool ATTRIBUTE = DEFAULT;
+#include "X86GenSubtargetInfo.inc"
/// The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
@@ -496,21 +83,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
// FIXME: this is a known good value for Yonah. How about others?
unsigned MaxInlineSizeThreshold = 128;
- /// Indicates target prefers 128 bit instructions.
- bool Prefer128Bit = false;
-
- /// Indicates target prefers 256 bit instructions.
- bool Prefer256Bit = false;
-
- /// Indicates target prefers AVX512 mask registers.
- bool PreferMaskRegisters = false;
-
- /// Use Silvermont specific arithmetic costs.
- bool UseSLMArithCosts = false;
-
- /// Use Goldmont specific floating point div/sqrt costs.
- bool UseGLMDivSqrtCosts = false;
-
/// What processor and OS we're targeting.
Triple TargetTriple;
@@ -520,7 +92,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
std::unique_ptr<RegisterBankInfo> RegBankInfo;
std::unique_ptr<InstructionSelector> InstSelector;
-private:
/// Override the stack alignment.
MaybeAlign StackAlignOverride;
@@ -534,15 +105,6 @@ private:
/// Required vector width from function attribute.
unsigned RequiredVectorWidth;
- /// True if compiling for 64-bit, false for 16-bit or 32-bit.
- bool In64BitMode = false;
-
- /// True if compiling for 32-bit, false for 16-bit or 64-bit.
- bool In32BitMode = false;
-
- /// True if compiling for 16-bit, false for 32-bit or 64-bit.
- bool In16BitMode = false;
-
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
// X86TargetLowering needs.
@@ -608,38 +170,32 @@ private:
void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
public:
- /// Is this x86_64? (disregarding specific ABI / programming model)
- bool is64Bit() const {
- return In64BitMode;
- }
- bool is32Bit() const {
- return In32BitMode;
- }
-
- bool is16Bit() const {
- return In16BitMode;
- }
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool GETTER() const { return ATTRIBUTE; }
+#include "X86GenSubtargetInfo.inc"
/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
bool isTarget64BitILP32() const {
- return In64BitMode && (TargetTriple.isX32() || TargetTriple.isOSNaCl());
+ return Is64Bit && (TargetTriple.isX32() || TargetTriple.isOSNaCl());
}
/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
bool isTarget64BitLP64() const {
- return In64BitMode && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl());
+ return Is64Bit && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl());
}
PICStyles::Style getPICStyle() const { return PICStyle; }
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
- bool hasX87() const { return HasX87; }
- bool hasCmpxchg8b() const { return HasCmpxchg8b; }
- bool hasNOPL() const { return HasNOPL; }
+ bool canUseCMPXCHG8B() const { return hasCX8(); }
+ bool canUseCMPXCHG16B() const {
+ // CX16 is just the CPUID bit; the instruction also requires 64-bit mode.
+ return hasCX16() && is64Bit();
+ }
// SSE codegen depends on cmovs, and all SSE1+ processors support them.
// All 64-bit processors support cmov.
- bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); }
+ bool canUseCMOV() const { return hasCMOV() || hasSSE1() || is64Bit(); }
bool hasSSE1() const { return X86SSELevel >= SSE1; }
bool hasSSE2() const { return X86SSELevel >= SSE2; }
bool hasSSE3() const { return X86SSELevel >= SSE3; }
@@ -648,146 +204,26 @@ public:
bool hasSSE42() const { return X86SSELevel >= SSE42; }
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
- bool hasAVX512() const { return X86SSELevel >= AVX512F; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512; }
bool hasInt256() const { return hasAVX2(); }
- bool hasSSE4A() const { return HasSSE4A; }
bool hasMMX() const { return X863DNowLevel >= MMX; }
- bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
- bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
- bool hasPOPCNT() const { return HasPOPCNT; }
- bool hasAES() const { return HasAES; }
- bool hasVAES() const { return HasVAES; }
- bool hasFXSR() const { return HasFXSR; }
- bool hasXSAVE() const { return HasXSAVE; }
- bool hasXSAVEOPT() const { return HasXSAVEOPT; }
- bool hasXSAVEC() const { return HasXSAVEC; }
- bool hasXSAVES() const { return HasXSAVES; }
- bool hasPCLMUL() const { return HasPCLMUL; }
- bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
- bool hasGFNI() const { return HasGFNI; }
- // Prefer FMA4 to FMA - its better for commutation/memory folding and
- // has equal or better performance on all supported targets.
- bool hasFMA() const { return HasFMA; }
- bool hasFMA4() const { return HasFMA4; }
+ bool hasThreeDNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool hasThreeDNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
- bool hasXOP() const { return HasXOP; }
- bool hasTBM() const { return HasTBM; }
- bool hasLWP() const { return HasLWP; }
- bool hasMOVBE() const { return HasMOVBE; }
- bool hasRDRAND() const { return HasRDRAND; }
- bool hasF16C() const { return HasF16C; }
- bool hasFSGSBase() const { return HasFSGSBase; }
- bool hasLZCNT() const { return HasLZCNT; }
- bool hasBMI() const { return HasBMI; }
- bool hasBMI2() const { return HasBMI2; }
- bool hasVBMI() const { return HasVBMI; }
- bool hasVBMI2() const { return HasVBMI2; }
- bool hasIFMA() const { return HasIFMA; }
- bool hasRTM() const { return HasRTM; }
- bool hasADX() const { return HasADX; }
- bool hasSHA() const { return HasSHA; }
- bool hasPRFCHW() const { return HasPRFCHW; }
- bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
bool hasPrefetchW() const {
// The PREFETCHW instruction was added with 3DNow but later CPUs gave it
// its own CPUID bit as part of deprecating 3DNow. Intel eventually added
// it and KNL has another that prefetches to L2 cache. We assume the
// L1 version exists if the L2 version does.
- return has3DNow() || hasPRFCHW() || hasPREFETCHWT1();
+ return hasThreeDNow() || hasPRFCHW() || hasPREFETCHWT1();
}
bool hasSSEPrefetch() const {
// We implicitly enable these when we have a write prefix supporting cache
// level OR if we have prfchw, but don't already have a read prefetch from
// 3dnow.
- return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
- }
- bool hasRDSEED() const { return HasRDSEED; }
- bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); }
- bool hasMWAITX() const { return HasMWAITX; }
- bool hasCLZERO() const { return HasCLZERO; }
- bool hasCLDEMOTE() const { return HasCLDEMOTE; }
- bool hasMOVDIRI() const { return HasMOVDIRI; }
- bool hasMOVDIR64B() const { return HasMOVDIR64B; }
- bool hasPTWRITE() const { return HasPTWRITE; }
- bool isSHLDSlow() const { return IsSHLDSlow; }
- bool isPMULLDSlow() const { return IsPMULLDSlow; }
- bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
- bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
- bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
- bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
- bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
- bool useLeaForSP() const { return UseLeaForSP; }
- bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
- bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
- bool hasFastVariableCrossLaneShuffle() const {
- return HasFastVariableCrossLaneShuffle;
- }
- bool hasFastVariablePerLaneShuffle() const {
- return HasFastVariablePerLaneShuffle;
+ return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHWT1();
}
- bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
- bool hasFastGather() const { return HasFastGather; }
- bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
- bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
- bool hasFastLZCNT() const { return HasFastLZCNT; }
- bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
- bool hasFastBEXTR() const { return HasFastBEXTR; }
- bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
- bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
- bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
- bool hasFastMOVBE() const { return HasFastMOVBE; }
- bool hasMacroFusion() const { return HasMacroFusion; }
- bool hasBranchFusion() const { return HasBranchFusion; }
- bool hasERMSB() const { return HasERMSB; }
- bool hasFSRM() const { return HasFSRM; }
- bool hasSlowDivide32() const { return HasSlowDivide32; }
- bool hasSlowDivide64() const { return HasSlowDivide64; }
- bool padShortFunctions() const { return PadShortFunctions; }
- bool slowTwoMemOps() const { return SlowTwoMemOps; }
- bool LEAusesAG() const { return LEAUsesAG; }
- bool slowLEA() const { return SlowLEA; }
- bool slow3OpsLEA() const { return Slow3OpsLEA; }
- bool slowIncDec() const { return SlowIncDec; }
- bool hasCDI() const { return HasCDI; }
- bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
- bool hasPFI() const { return HasPFI; }
- bool hasERI() const { return HasERI; }
- bool hasDQI() const { return HasDQI; }
- bool hasBWI() const { return HasBWI; }
- bool hasVLX() const { return HasVLX; }
- bool hasFP16() const { return HasFP16; }
- bool hasPKU() const { return HasPKU; }
- bool hasVNNI() const { return HasVNNI; }
- bool hasBF16() const { return HasBF16; }
- bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
- bool hasBITALG() const { return HasBITALG; }
- bool hasSHSTK() const { return HasSHSTK; }
- bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
- bool hasCLWB() const { return HasCLWB; }
- bool hasWBNOINVD() const { return HasWBNOINVD; }
- bool hasRDPID() const { return HasRDPID; }
- bool hasWAITPKG() const { return HasWAITPKG; }
- bool hasPCONFIG() const { return HasPCONFIG; }
- bool hasSGX() const { return HasSGX; }
- bool hasINVPCID() const { return HasINVPCID; }
- bool hasENQCMD() const { return HasENQCMD; }
- bool hasKL() const { return HasKL; }
- bool hasWIDEKL() const { return HasWIDEKL; }
- bool hasHRESET() const { return HasHRESET; }
- bool hasSERIALIZE() const { return HasSERIALIZE; }
- bool hasTSXLDTRK() const { return HasTSXLDTRK; }
- bool hasUINTR() const { return HasUINTR; }
- bool hasCRC32() const { return HasCRC32; }
- bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
- bool useRetpolineIndirectBranches() const {
- return UseRetpolineIndirectBranches;
- }
- bool hasAVXVNNI() const { return HasAVXVNNI; }
- bool hasAMXTILE() const { return HasAMXTILE; }
- bool hasAMXBF16() const { return HasAMXBF16; }
- bool hasAMXINT8() const { return HasAMXINT8; }
- bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
-
+ bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); }
// These are generic getters that OR together all of the thunk types
// supported by the subtarget. Therefore useIndirectThunk*() will return true
// if any respective thunk feature is enabled.
@@ -798,16 +234,6 @@ public:
return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
}
- bool preferMaskRegisters() const { return PreferMaskRegisters; }
- bool useSLMArithCosts() const { return UseSLMArithCosts; }
- bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
- bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
- bool allowTaggedGlobals() const { return AllowTaggedGlobals; }
- bool useLVILoadHardening() const { return UseLVILoadHardening; }
- bool useSpeculativeExecutionSideEffectSuppression() const {
- return UseSpeculativeExecutionSideEffectSuppression;
- }
-
unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
@@ -834,11 +260,6 @@ public:
bool isXRaySupported() const override { return is64Bit(); }
- /// TODO: to be removed later and replaced with suitable properties
- bool isAtom() const { return X86ProcFamily == IntelAtom; }
- bool useSoftFloat() const { return UseSoftFloat; }
- bool useAA() const override { return UseAA; }
-
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
/// no-sse2). There isn't any reason to disable it if the target processor
/// supports it.
@@ -850,7 +271,7 @@ public:
bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
- bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
+ bool isTargetPS() const { return TargetTriple.isPS(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
@@ -890,9 +311,9 @@ public:
bool isOSWindows() const { return TargetTriple.isOSWindows(); }
- bool isTargetWin64() const { return In64BitMode && isOSWindows(); }
+ bool isTargetWin64() const { return Is64Bit && isOSWindows(); }
- bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
+ bool isTargetWin32() const { return !Is64Bit && isOSWindows(); }
bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; }
bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; }
@@ -990,8 +411,6 @@ public:
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
-
- bool enableAdvancedRASplitCost() const override { return false; }
};
} // end namespace llvm
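
The GET_SUBTARGETINFO_MACRO lines above replace the hand-written `bool HasFoo` members and `hasFoo()` getters with an X-macro expansion: TableGen now emits one macro invocation per subtarget feature into X86GenSubtargetInfo.inc, and the header defines the macro twice, once to stamp out the field with its default and once to stamp out the const getter. A minimal, self-contained sketch of that pattern follows; the SUBTARGET_FEATURES list and the feature names are made-up stand-ins for the generated include, not the real file.

    // Stand-in for the TableGen-generated include; the real code does
    // #include "X86GenSubtargetInfo.inc" instead of expanding this list.
    #define SUBTARGET_FEATURES(X)                                              \
      X(HasAVX2, false, hasAVX2)                                               \
      X(Is64Bit, false, is64Bit)                                               \
      X(HasCX8, false, hasCX8)

    class SubtargetSketch {
      // First expansion: one bool member per feature, set to its default.
    #define DECLARE_FIELD(ATTRIBUTE, DEFAULT, GETTER) bool ATTRIBUTE = DEFAULT;
      SUBTARGET_FEATURES(DECLARE_FIELD)
    #undef DECLARE_FIELD

    public:
      // Second expansion: one const getter per feature.
    #define DECLARE_GETTER(ATTRIBUTE, DEFAULT, GETTER)                         \
      bool GETTER() const { return ATTRIBUTE; }
      SUBTARGET_FEATURES(DECLARE_GETTER)
    #undef DECLARE_GETTER
    };

    int main() {
      SubtargetSketch ST;
      return ST.hasAVX2() || ST.is64Bit(); // all defaults are false here
    }

With this in place, adding a SubtargetFeature in the .td file is enough to get both the member and the accessor; only semantic helpers that combine features (canUseCMPXCHG16B, canUseCMOV, canUseLAHFSAHF) stay hand-written.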
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index e3d0128dd73d..4249788e3540 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -27,13 +27,16 @@
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ExecutionDomainFix.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -56,6 +59,11 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableTileRAPass("x86-tile-ra",
+ cl::desc("Enable the tile register allocation pass"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
@@ -65,6 +73,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86LowerAMXIntrinsicsLegacyPassPass(PR);
initializeX86LowerAMXTypeLegacyPassPass(PR);
initializeX86PreAMXConfigPassPass(PR);
+ initializeX86PreTileConfigPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -75,6 +84,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86TileConfigPass(PR);
+ initializeX86FastPreTileConfigPass(PR);
initializeX86FastTileConfigPass(PR);
initializeX86LowerTileCopyPass(PR);
initializeX86ExpandPseudoPass(PR);
@@ -154,7 +164,7 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
bool JIT,
Optional<Reloc::Model> RM) {
bool is64Bit = TT.getArch() == Triple::x86_64;
- if (!RM.hasValue()) {
+ if (!RM) {
// JIT codegen should use static relocations by default, since it's
// typically executed in process and not relocatable.
if (JIT)
@@ -218,9 +228,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
OL),
TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
- // On PS4, the "return address" of a 'noreturn' call must still be within
+ // On PS4/PS5, the "return address" of a 'noreturn' call must still be within
// the calling function, and TrapUnreachable is an easy way to get that.
- if (TT.isPS4() || TT.isOSBinFormatMachO()) {
+ if (TT.isPS() || TT.isOSBinFormatMachO()) {
this->Options.TrapUnreachable = true;
this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
}
@@ -333,7 +343,7 @@ bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
//===----------------------------------------------------------------------===//
TargetTransformInfo
-X86TargetMachine::getTargetTransformInfo(const Function &F) {
+X86TargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(X86TTIImpl(this, F));
}
@@ -382,7 +392,7 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
- bool addPreRewrite() override;
+ bool addRegAssignAndRewriteOptimized() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -417,9 +427,6 @@ void X86PassConfig::addIRPasses() {
addPass(createX86LowerAMXIntrinsicsPass());
addPass(createX86LowerAMXTypePass());
- if (TM->getOptLevel() == CodeGenOpt::None)
- addPass(createX86PreAMXConfigPass());
-
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None) {
@@ -441,6 +448,9 @@ void X86PassConfig::addIRPasses() {
addPass(createCFGuardCheckPass());
}
}
+
+ if (TM->Options.JMCInstrument)
+ addPass(createJMCInstrumenterPass());
}
bool X86PassConfig::addInstSelector() {
@@ -505,9 +515,10 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86DynAllocaExpander());
- if (getOptLevel() != CodeGenOpt::None) {
+ if (getOptLevel() != CodeGenOpt::None)
addPass(createX86PreTileConfigPass());
- }
+ else
+ addPass(createX86FastPreTileConfigPass());
}
void X86PassConfig::addMachineSSAOptimization() {
@@ -607,11 +618,21 @@ bool X86PassConfig::addPostFastRegAllocRewrite() {
return true;
}
-bool X86PassConfig::addPreRewrite() {
- addPass(createX86TileConfigPass());
- return true;
-}
-
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
+
+static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) {
+ return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC);
+}
+
+bool X86PassConfig::addRegAssignAndRewriteOptimized() {
+ // Don't support tile RA when the register allocator is specified on the command line via "-regalloc".
+ if (!isCustomizedRegAlloc() && EnableTileRAPass) {
+ // Allocate tile register first.
+ addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters));
+ addPass(createX86TileConfigPass());
+ }
+ return TargetPassConfig::addRegAssignAndRewriteOptimized();
+}
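
The addRegAssignAndRewriteOptimized() override above splits optimized register allocation into two stages: a greedy run restricted by onlyAllocateTileRegisters to the AMX TILE class, then X86TileConfigPass once the TMM assignments are known, and finally the default allocation for everything else. The extra stage is skipped when -x86-tile-ra is disabled or when the user picked an allocator with -regalloc. Below is a rough standalone sketch of the filtered-first-pass idea with invented names; the real code simply passes a filter into createGreedyRegisterAllocator().

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Illustration only: run an allocator restricted to one register class
    // first, then let a second run handle whatever is still unassigned.
    struct VirtReg {
      std::string RegClass;
      bool Assigned = false;
    };

    using Filter = std::function<bool(const VirtReg &)>;

    void allocate(std::vector<VirtReg> &VRegs, const Filter &F,
                  const char *Phase) {
      for (VirtReg &VR : VRegs)
        if (!VR.Assigned && F(VR)) {
          VR.Assigned = true; // a real allocator would pick a physical register
          std::cout << Phase << ": assigned a " << VR.RegClass << " register\n";
        }
    }

    int main() {
      std::vector<VirtReg> VRegs = {{"TILE"}, {"GR64"}, {"TILE"}, {"VR256"}};
      // Stage 1: AMX tile registers only, so the tile configuration can be
      // materialized before any other class is touched.
      allocate(VRegs, [](const VirtReg &VR) { return VR.RegClass == "TILE"; },
               "tile RA");
      // ...X86TileConfigPass would run here, with TMM assignments fixed...
      // Stage 2: everything that is still unassigned.
      allocate(VRegs, [](const VirtReg &) { return true; }, "main RA");
    }

At -O0 the same split is handled by createX86FastPreTileConfigPass() in addPreRegAlloc() instead, as the earlier hunk shows.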
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index 69d7e48b8977..70df8da77641 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -44,7 +44,7 @@ public:
// attributes of each function.
const X86Subtarget *getSubtargetImpl() const = delete;
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
// Set up the pass pipeline.
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5b95c10332dc..b36f8a3d06d0 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1085,7 +1085,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
@@ -1223,6 +1224,63 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());
+ if (!Mask.empty() && NumOfDests.isValid()) {
+ // Try to perform better estimation of the permutation.
+ // 1. Split the source/destination vectors into real registers.
+ // 2. Do the mask analysis to identify which real registers are
+ // permuted. If more than one source register is used to build a
+ // destination register, the cost for that destination register is
+ // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
+ // source register is used, build the mask and calculate the cost as a
+ // cost of PermuteSingleSrc.
+ // Also, for the single register permute we try to identify if the
+ // destination register is just a copy of the source register or the
+ // copy of the previous destination register (the cost is
+ // TTI::TCC_Basic). If the source register is just reused, the cost for
+ // this operation is 0.
+ unsigned E = *NumOfDests.getValue();
+ unsigned NormalizedVF =
+ LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
+ unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
+ SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
+ copy(Mask, NormalizedMask.begin());
+ unsigned PrevSrcReg = 0;
+ ArrayRef<int> PrevRegMask;
+ InstructionCost Cost = 0;
+ processShuffleMasks(
+ NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
+ [this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
+ &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
+ if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
+ // Check if the previous register can be just copied to the next
+ // one.
+ if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
+ PrevRegMask != RegMask)
+ Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+ RegMask, 0, nullptr);
+ else
+ // Just a copy of previous destination register.
+ Cost += TTI::TCC_Basic;
+ return;
+ }
+ if (SrcReg != DestReg &&
+ any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
+ // Just a copy of the source register.
+ Cost += TTI::TCC_Basic;
+ }
+ PrevSrcReg = SrcReg;
+ PrevRegMask = RegMask;
+ },
+ [this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
+ unsigned /*Unused*/,
+ unsigned /*Unused*/) {
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
+ 0, nullptr);
+ });
+ return Cost;
+ }
+
InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
None, 0, nullptr);
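
The comment block above describes the per-register shuffle cost analysis: normalize the mask to whole legal registers, then classify the sub-mask feeding each destination register as a plain copy, a single-source permute, or a set of two-source permutes. The toy program below walks through that classification for one 8-element mask split into 4-wide registers; it is deliberately much simpler than LLVM's processShuffleMasks and the names are illustrative.

    #include <array>
    #include <cstdio>

    // Split a wide shuffle mask into sub-masks of the legal register width and
    // classify each destination register. Toy version of the analysis above.
    int main() {
      constexpr int RegWidth = 4;                         // e.g. v4f32 registers
      std::array<int, 8> Mask = {0, 1, 2, 3, 4, 0, 6, 2}; // v8f32 shuffle mask

      for (int Dst = 0; Dst * RegWidth < (int)Mask.size(); ++Dst) {
        bool Identity = true;
        int SrcRegs[2] = {0, 0}; // which source registers feed this destination
        for (int I = 0; I != RegWidth; ++I) {
          int M = Mask[Dst * RegWidth + I];
          if (M < 0)
            continue;                                     // undef lane
          SrcRegs[M / RegWidth] = 1;
          if (M != Dst * RegWidth + I)
            Identity = false;
        }
        int NumSrcs = SrcRegs[0] + SrcRegs[1];
        if (Identity)
          std::printf("dst %d: plain copy (TCC_Basic or free)\n", Dst);
        else if (NumSrcs <= 1)
          std::printf("dst %d: PermuteSingleSrc\n", Dst);
        else
          std::printf("dst %d: %d x PermuteTwoSrc\n", Dst, NumSrcs - 1);
      }
    }

For the example mask, destination register 0 is an identity copy and destination register 1 draws from two source registers, so it is priced as one two-source permute rather than falling back to the old worst-case (NumOfSrcs - 1) * NumOfDests estimate.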
@@ -1545,9 +1603,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
- if (ST->hasSSE2())
+ static const CostTblEntry SSE3BroadcastLoadTbl[] = {
+ {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
+ };
+
+ if (ST->hasSSE2()) {
+ bool IsLoad =
+ llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
+ if (ST->hasSSE3() && IsLoad)
+ if (const auto *Entry =
+ CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
+ assert(isLegalBroadcastLoad(BaseTp->getElementType(),
+ LT.second.getVectorElementCount()) &&
+ "Table entry missing from isLegalBroadcastLoad()");
+ return LT.first * Entry->Cost;
+ }
+
if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
+ }
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
@@ -2444,6 +2518,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
std::pair<InstructionCost, MVT> LTDest =
TLI->getTypeLegalizationCost(DL, Dst);
+ // If we're truncating to the same legalized type, just assume it's free.
+ if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
+ return TTI::TCC_Free;
+
if (ST->useAVX512Regs()) {
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(
@@ -2545,7 +2623,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- unsigned ExtraCost = 0;
+ InstructionCost ExtraCost = 0;
if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
// Some vector comparison predicates cost extra instructions.
// TODO: Should we invert this and assume worst case cmp costs
@@ -2619,15 +2697,29 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16f32, 1 },
{ ISD::SELECT, MVT::v8i64, 1 },
+ { ISD::SELECT, MVT::v4i64, 1 },
+ { ISD::SELECT, MVT::v2i64, 1 },
{ ISD::SELECT, MVT::v16i32, 1 },
+ { ISD::SELECT, MVT::v8i32, 1 },
+ { ISD::SELECT, MVT::v4i32, 1 },
{ ISD::SELECT, MVT::v8f64, 1 },
+ { ISD::SELECT, MVT::v4f64, 1 },
+ { ISD::SELECT, MVT::v2f64, 1 },
+ { ISD::SELECT, MVT::f64, 1 },
{ ISD::SELECT, MVT::v16f32, 1 },
+ { ISD::SELECT, MVT::v8f32 , 1 },
+ { ISD::SELECT, MVT::v4f32, 1 },
+ { ISD::SELECT, MVT::f32 , 1 },
{ ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
{ ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
- { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
- { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v32i16, 2 },
+ { ISD::SELECT, MVT::v16i16, 1 },
+ { ISD::SELECT, MVT::v8i16, 1 },
+ { ISD::SELECT, MVT::v64i8, 2 },
+ { ISD::SELECT, MVT::v32i8, 1 },
+ { ISD::SELECT, MVT::v16i8, 1 },
};
static const CostTblEntry AVX2CostTbl[] = {
@@ -2636,10 +2728,12 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i16, 1 },
{ ISD::SETCC, MVT::v32i8, 1 },
- { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
- { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
- { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
- { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
+ { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb
};
static const CostTblEntry AVX1CostTbl[] = {
@@ -2651,49 +2745,54 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i16, 4 },
{ ISD::SETCC, MVT::v32i8, 4 },
- { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
- { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
- { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
- { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
+ { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd
+ { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps
+ { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd
+ { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps
{ ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
{ ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
};
static const CostTblEntry SSE42CostTbl[] = {
- { ISD::SETCC, MVT::v2f64, 1 },
- { ISD::SETCC, MVT::v4f32, 1 },
{ ISD::SETCC, MVT::v2i64, 1 },
};
static const CostTblEntry SSE41CostTbl[] = {
- { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
- { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
- { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
- { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
- { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
- { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+
+ { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd
+ { ISD::SELECT, MVT::f64, 2 }, // blendvpd
+ { ISD::SELECT, MVT::v4f32, 2 }, // blendvps
+ { ISD::SELECT, MVT::f32 , 2 }, // blendvps
+ { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb
+ { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb
};
static const CostTblEntry SSE2CostTbl[] = {
{ ISD::SETCC, MVT::v2f64, 2 },
{ ISD::SETCC, MVT::f64, 1 },
- { ISD::SETCC, MVT::v2i64, 8 },
+ { ISD::SETCC, MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion
{ ISD::SETCC, MVT::v4i32, 1 },
{ ISD::SETCC, MVT::v8i16, 1 },
{ ISD::SETCC, MVT::v16i8, 1 },
- { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
- { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
- { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
- { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
- { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
+ { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por
};
static const CostTblEntry SSE1CostTbl[] = {
{ ISD::SETCC, MVT::v4f32, 2 },
{ ISD::SETCC, MVT::f32, 1 },
- { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
+ { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
+ { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
};
if (ST->useSLMArithCosts())
@@ -3555,7 +3654,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
assert(Val->isVectorTy() && "This must be a vector type");
Type *ScalarType = Val->getScalarType();
- int RegisterFileMoveCost = 0;
+ InstructionCost RegisterFileMoveCost = 0;
// Non-immediate extraction/insertion can be handled as a sequence of
// aliased loads+stores via the stack.
@@ -3589,6 +3688,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index != -1U && (Opcode == Instruction::ExtractElement ||
Opcode == Instruction::InsertElement)) {
+ // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
+ if (Opcode == Instruction::ExtractElement &&
+ ScalarType->getScalarSizeInBits() == 1 &&
+ cast<FixedVectorType>(Val)->getNumElements() > 1)
+ return 1;
+
// Legalize the type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -3597,15 +3702,16 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return 0;
// The type may be split. Normalize the index to the new type.
+ unsigned SizeInBits = LT.second.getSizeInBits();
unsigned NumElts = LT.second.getVectorNumElements();
unsigned SubNumElts = NumElts;
Index = Index % NumElts;
// For >128-bit vectors, we need to extract higher 128-bit subvectors.
// For inserts, we also need to insert the subvector back.
- if (LT.second.getSizeInBits() > 128) {
- assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
- unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ if (SizeInBits > 128) {
+ assert((SizeInBits % 128) == 0 && "Illegal vector");
+ unsigned NumSubVecs = SizeInBits / 128;
SubNumElts = NumElts / NumSubVecs;
if (SubNumElts <= Index) {
RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
@@ -3673,20 +3779,25 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert,
bool Extract) {
+ assert(DemandedElts.getBitWidth() ==
+ cast<FixedVectorType>(Ty)->getNumElements() &&
+ "Vector size mismatch");
+
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ MVT MScalarTy = LT.second.getScalarType();
+ unsigned SizeInBits = LT.second.getSizeInBits();
+
InstructionCost Cost = 0;
// For insertions, a ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
- std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- MVT MScalarTy = LT.second.getScalarType();
-
if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
(MScalarTy.isInteger() && ST->hasSSE41()) ||
(MScalarTy == MVT::f32 && ST->hasSSE41())) {
// For types we can insert directly, insertion into 128-bit sub vectors is
// cheap, followed by a cheap chain of concatenations.
- if (LT.second.getSizeInBits() <= 128) {
+ if (SizeInBits <= 128) {
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
@@ -3704,9 +3815,9 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
// Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
const int CostValue = *LT.first.getValue();
assert(CostValue >= 0 && "Negative cost!");
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
+ unsigned Num128Lanes = SizeInBits / 128 * CostValue;
unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
- APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
+ APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
unsigned Scale = NumElts / Num128Lanes;
// We iterate each 128-lane, and check if we need a
// extracti128/inserti128 for this 128-lane.
@@ -3747,10 +3858,59 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
}
}
- // TODO: Use default extraction for now, but we should investigate extending this
- // to handle repeated subvector extraction.
- if (Extract)
+ if (Extract) {
+ // vXi1 can be efficiently extracted with MOVMSK.
+ // TODO: AVX512 predicate mask handling.
+ // NOTE: This doesn't work well for roundtrip scalarization.
+ if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
+ unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
+ return MOVMSKCost;
+ }
+
+ if (LT.second.isVector()) {
+ int CostValue = *LT.first.getValue();
+ assert(CostValue >= 0 && "Negative cost!");
+
+ unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
+ assert(NumElts >= DemandedElts.getBitWidth() &&
+ "Vector has been legalized to smaller element count");
+
+ // If we're extracting elements from a 128-bit subvector lane, we only need
+ // to extract each lane once, not for every element.
+ if (SizeInBits > 128) {
+ assert((SizeInBits % 128) == 0 && "Illegal vector");
+ unsigned NumLegal128Lanes = SizeInBits / 128;
+ unsigned Num128Lanes = NumLegal128Lanes * CostValue;
+ APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
+ unsigned Scale = NumElts / Num128Lanes;
+
+ // Add cost for each demanded 128-bit subvector extraction.
+ // Luckily this is a lot easier than for insertion.
+ APInt DemandedUpper128Lanes =
+ APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes);
+ auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale);
+ for (unsigned I = 0; I != Num128Lanes; ++I)
+ if (DemandedUpper128Lanes[I])
+ Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+ I * Scale, Ty128);
+
+ // Add all the demanded element extractions together, but adjust the
+ // index to use the equivalent of the bottom 128 bit lane.
+ for (unsigned I = 0; I != NumElts; ++I)
+ if (WidenedDemandedElts[I]) {
+ unsigned Idx = I % Scale;
+ Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
+ }
+
+ return Cost;
+ }
+ }
+
+ // Fallback to default extraction.
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+ }
return Cost;
}
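
Two extraction shortcuts are added above: extracting all the bits of a vXi1 vector is modeled as one MOVMSK per covered register (ceil of NumElts over 32 with AVX2, over 16 otherwise), and for vectors wider than 128 bits each demanded 128-bit lane pays a single subvector extract, after which individual elements are extracted at lane-local indices. The toy walk-through below shows the lane accounting for a 256-bit vector; the cost units are made up.

    #include <bitset>
    #include <cstdio>

    // Each demanded 128-bit lane pays one subvector extract; each demanded
    // element then pays one in-lane extract at index (element % lane width).
    int main() {
      constexpr unsigned NumElts = 8;             // e.g. v8f32 (256 bits)
      constexpr unsigned EltsPerLane = 4;         // 128-bit lanes of f32
      std::bitset<NumElts> Demanded("10010010");  // elements 1, 4 and 7

      unsigned LaneExtracts = 0, EltExtracts = 0;
      for (unsigned Lane = 0; Lane != NumElts / EltsPerLane; ++Lane) {
        bool LaneDemanded = false;
        for (unsigned I = 0; I != EltsPerLane; ++I)
          if (Demanded[Lane * EltsPerLane + I]) {
            LaneDemanded = true;
            ++EltExtracts;                        // extract at lane-local index I
          }
        LaneExtracts += LaneDemanded;             // one extracti128-style shuffle
      }
      std::printf("%u lane extracts + %u element extracts\n", LaneExtracts,
                  EltExtracts);
    }

Here both lanes are touched, so the total is 2 lane extracts plus 3 element extracts, instead of charging a full-width extraction for every demanded element.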
@@ -3855,8 +4015,7 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
// if all elements that will form a single Dst vector aren't demanded,
// then we won't need to do that shuffle, so adjust the cost accordingly.
APInt DemandedDstVectors = APIntOps::ScaleBitMask(
- DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
- NumDstVectors);
+ DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
InstructionCost SingleShuffleCost =
@@ -5029,8 +5188,8 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
-bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2) {
+bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2) {
// X86 specific here are "instruction number 1st priority".
return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
C1.NumIVMuls, C1.NumBaseAdds,
@@ -5110,6 +5269,14 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
return true;
}
+bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
+ ElementCount NumElements) const {
+ // movddup
+ return ST->hasSSE3() && !NumElements.isScalable() &&
+ NumElements.getFixedValue() == 2 &&
+ ElementTy == Type::getDoubleTy(ElementTy->getContext());
+}
+
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
if (!isa<VectorType>(DataTy))
return false;
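
isLegalBroadcastLoad() above is intentionally narrow: only a fixed two-element f64 broadcast qualifies, because SSE3's MOVDDUP loads one double and duplicates it across the XMM register, which is why the SSE3BroadcastLoadTbl entry earlier in the patch can price that broadcast at zero when one of the shuffle's arguments is a load. A standalone restatement of the rule, with simplified types rather than the LLVM signature:

    #include <cstdio>

    // Sketch of the legality rule encoded by isLegalBroadcastLoad(): a
    // broadcast load is only free for <2 x double> on SSE3+ (MOVDDUP).
    enum class EltTy { F32, F64, I64 };

    bool isFreeBroadcastLoad(bool HasSSE3, EltTy Elt, unsigned NumElts,
                             bool Scalable) {
      return HasSSE3 && !Scalable && NumElts == 2 && Elt == EltTy::F64;
    }

    int main() {
      std::printf("<2 x double>: %d\n",
                  isFreeBroadcastLoad(true, EltTy::F64, 2, false));
      std::printf("<4 x float> : %d\n",
                  isFreeBroadcastLoad(true, EltTy::F32, 4, false));
    }

The assert in getShuffleCost() keeps the cost-table entry and this predicate from drifting apart.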
@@ -5174,6 +5341,39 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
return IntWidth == 32 || IntWidth == 64;
}
+bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask) const {
+ // ADDSUBPS 4xf32 SSE3
+ // VADDSUBPS 4xf32 AVX
+ // VADDSUBPS 8xf32 AVX2
+ // ADDSUBPD 2xf64 SSE3
+ // VADDSUBPD 2xf64 AVX
+ // VADDSUBPD 4xf64 AVX2
+
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+ assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
+ if (!isPowerOf2_32(NumElements))
+ return false;
+ // Check the opcode pattern. We apply the mask on the opcode arguments and
+ // then check if it is what we expect.
+ for (int Lane : seq<int>(0, NumElements)) {
+ unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
+ // We expect FSub for even lanes and FAdd for odd lanes.
+ if (Lane % 2 == 0 && Opc != Instruction::FSub)
+ return false;
+ if (Lane % 2 == 1 && Opc != Instruction::FAdd)
+ return false;
+ }
+ // Now check that the pattern is supported by the target ISA.
+ Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
+ if (ElemTy->isFloatTy())
+ return ST->hasSSE3() && NumElements % 4 == 0;
+ if (ElemTy->isDoubleTy())
+ return ST->hasSSE3() && NumElements % 2 == 0;
+ return false;
+}
+
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
// AVX2 doesn't support scatter
if (!ST->hasAVX512())
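
isLegalAltInstr() above lets the SLP vectorizer ask whether an alternating opcode pattern can map onto (V)ADDSUBPS/PD: even lanes must compute FSub and odd lanes FAdd, the lane count must be a power of two, and the element type restricts the legal widths (multiples of 4 f32 lanes or 2 f64 lanes, SSE3 required). The self-contained sketch below reproduces just the lane-pattern check, using std::vector<bool> in place of SmallBitVector.

    #include <cstdio>
    #include <vector>

    // OpcodeMask picks Opcode1 for set lanes and Opcode0 otherwise; the lanes
    // must spell FSub, FAdd, FSub, FAdd, ... to match ADDSUB.
    enum class Opc { FAdd, FSub, Other };

    bool isAddSubPattern(Opc Opcode0, Opc Opcode1,
                         const std::vector<bool> &OpcodeMask) {
      for (std::size_t Lane = 0; Lane != OpcodeMask.size(); ++Lane) {
        Opc O = OpcodeMask[Lane] ? Opcode1 : Opcode0;
        if (Lane % 2 == 0 && O != Opc::FSub)
          return false;                 // even lanes must subtract
        if (Lane % 2 == 1 && O != Opc::FAdd)
          return false;                 // odd lanes must add
      }
      return !OpcodeMask.empty();
    }

    int main() {
      // <4 x float> with lanes sub, add, sub, add maps onto ADDSUBPS.
      std::vector<bool> Mask = {false, true, false, true};
      std::printf("addsub pattern: %d\n",
                  isAddSubPattern(Opc::FSub, Opc::FAdd, Mask));
    }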
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 69715072426f..bd3c3fb1bb2f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -38,12 +38,12 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const FeatureBitset InlineFeatureIgnoreList = {
// This indicates the CPU is 64 bit capable not that we are in 64-bit
// mode.
- X86::Feature64Bit,
+ X86::FeatureX86_64,
// These features don't have any intrinsics or ABI effect.
X86::FeatureNOPL,
- X86::FeatureCMPXCHG16B,
- X86::FeatureLAHFSAHF,
+ X86::FeatureCX16,
+ X86::FeatureLAHFSAHF64,
// Some older targets can be setup to fold unaligned loads.
X86::FeatureSSEUnalignedMem,
@@ -68,6 +68,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningMacroFusion,
X86::TuningPadShortFunctions,
X86::TuningPOPCNTFalseDeps,
+ X86::TuningMULCFalseDeps,
+ X86::TuningPERMFalseDeps,
+ X86::TuningRANGEFalseDeps,
+ X86::TuningGETMANTFalseDeps,
+ X86::TuningMULLQFalseDeps,
X86::TuningSlow3OpsLEA,
X86::TuningSlowDivide32,
X86::TuningSlowDivide64,
@@ -131,7 +136,8 @@ public:
const Instruction *CxtI = nullptr);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<const Value *> Args = None);
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
@@ -219,13 +225,14 @@ public:
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
- bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2);
+ bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType, Align Alignment);
bool isLegalMaskedStore(Type *DataType, Align Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
+ bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const;
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
return forceScalarizeMaskedGather(VTy, Alignment);
@@ -234,6 +241,8 @@ public:
bool isLegalMaskedScatter(Type *DataType, Align Alignment);
bool isLegalMaskedExpandLoad(Type *DataType);
bool isLegalMaskedCompressStore(Type *DataType);
+ bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask) const;
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
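
The InlineFeatureIgnoreList additions above feed areInlineCompatible(): features on the list are tuning-only, with no ABI or intrinsic effect, so they are masked out of both feature sets before checking that everything the callee requires is also available in the caller. A small sketch of that subset check, with an invented feature numbering and std::bitset standing in for FeatureBitset:

    #include <bitset>
    #include <cstdio>

    // Callee may be inlined if (callee & ~ignore) is a subset of
    // (caller & ~ignore); bit positions here are arbitrary, not LLVM's.
    int main() {
      constexpr std::size_t NumFeatures = 8;
      std::bitset<NumFeatures> Caller("00011111");
      std::bitset<NumFeatures> Callee("00111101"); // needs one extra feature
      std::bitset<NumFeatures> Ignore("00100000"); // but it is tuning-only

      std::bitset<NumFeatures> RealCaller = Caller & ~Ignore;
      std::bitset<NumFeatures> RealCallee = Callee & ~Ignore;
      bool CanInline = (RealCallee & ~RealCaller).none();
      std::printf("%s\n", CanInline ? "inlinable" : "not inlinable");
    }

Growing the ignore list with the new false-dependency tuning flags therefore keeps those flags from blocking inlining between otherwise compatible functions.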
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 8114a0b2d423..5cada924e006 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -36,7 +36,7 @@
using namespace llvm;
-#define DEBUG_TYPE "tile-config"
+#define DEBUG_TYPE "tileconfig"
namespace {
@@ -70,11 +70,11 @@ struct X86TileConfig : public MachineFunctionPass {
char X86TileConfig::ID = 0;
-INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
+INITIALIZE_PASS_BEGIN(X86TileConfig, DEBUG_TYPE, "Tile Register Configure",
false, false)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
- false, false)
+INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
+ false)
bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
@@ -90,7 +90,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
int SS = INT_MAX;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == X86::LDTILECFG) {
+ if (MI.getOpcode() == X86::PLDTILECFGV) {
SS = MI.getOperand(0).getIndex();
break;
}
@@ -98,6 +98,9 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
if (SS != INT_MAX)
break;
}
+ // Didn't find a PLDTILECFGV, so there is nothing to configure; just return false.
+ if (SS == INT_MAX)
+ return false;
// Try to find a point to insert MIs for constant shapes.
// Here we are leveraging the palette id inserted in PreRA pass.
@@ -120,6 +123,8 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
continue;
if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID)
continue;
+ if (VRM.getPhys(VirtReg) == VirtRegMap::NO_PHYS_REG)
+ continue;
unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0;
if (!Phys2Virt[Index])
Phys2Virt[Index] = VirtReg;
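
The X86TileConfig changes above rely on the pre-RA pass having already inserted PLDTILECFGV with its 64-byte stack slot and palette id; this pass then fills in the per-TMM shape fields once allocation has fixed which virtual tile register maps to which physical TMM, and it now skips virtual registers that received no physical assignment. For orientation, below is a sketch of the 64-byte configuration block that LDTILECFG consumes, following the layout described in the Intel SDM for palette 1; the field names are illustrative, not LLVM's.

    #include <cstdint>
    #include <cstdio>

    // 64-byte AMX tile configuration: palette id, then bytes-per-row and row
    // counts for tmm0..tmm7. Zero rows/colsb leaves a tile unconfigured.
    struct TileConfig {
      uint8_t Palette;        // byte 0: palette id (1 on current hardware)
      uint8_t StartRow;       // byte 1: restart row after an interrupt
      uint8_t Reserved0[14];  // bytes 2..15: must be zero
      uint16_t ColsB[8];      // bytes 16..31: bytes per row, tmm0..tmm7
      uint16_t Reserved1[8];  // bytes 32..47: must be zero
      uint8_t Rows[8];        // bytes 48..55: row count, tmm0..tmm7
      uint8_t Reserved2[8];   // bytes 56..63: must be zero
    };
    static_assert(sizeof(TileConfig) == 64, "LDTILECFG reads a 64-byte block");

    int main() {
      TileConfig Cfg{};       // zero-initialized: all tiles disabled
      Cfg.Palette = 1;
      Cfg.Rows[0] = 16;       // tmm0 configured as 16 rows x 64 bytes
      Cfg.ColsB[0] = 64;
      std::printf("config block is %zu bytes\n", sizeof(Cfg));
    }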