field | value | date / path
---|---|---
author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000
commit | 54db30ce18663e6c2991958f3b5d18362e8e93c4 (patch) |
tree | 4aa6442802570767398cc83ba484e97b1309bdc2 | /contrib/llvm/lib/Target/X86
parent | 35284c22e9c8348159b7ce032ea45f2cdeb65298 (diff) |
parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) |
Merge llvm trunk r366426, resolve conflicts, and update FREEBSD-Xlist.
Notes:
svn path=/projects/clang900-import/; revision=351344
Diffstat (limited to 'contrib/llvm/lib/Target/X86')
148 files changed, 18014 insertions, 14118 deletions
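
The first hunk below deletes X86AsmInstrumentation.cpp, whose header comment walks through the AddressSanitizer check the class used to emit around each instrumented memory access: save flags and scratch registers, compute the address with LEA, shift it right by 3, add the shadow offset, test the shadow byte, and call __asan_report_* when the test fails. As orientation for that hunk, here is a minimal standalone C++ sketch of the same shadow-byte test, assuming the 32-bit kShadowOffset from the file (the 64-bit class uses 0x7fff8000); it is an illustration only, not the MCStreamer-emitting code being removed.

```cpp
#include <cstdint>

// Standalone illustration only; the removed class emitted equivalent machine
// code via MCStreamer rather than running a check like this directly.
// 0x20000000 matches the 32-bit kShadowOffset in the file; the 64-bit class
// uses 0x7fff8000.
static constexpr std::uintptr_t kShadowOffset = 0x20000000;

// True when an AccessSize-byte access at Addr must be reported. The zero /
// nonzero test mirrors the "CMP ShadowOffset(ShadowReg), 0" / JE path used
// for 8- and 16-byte accesses (the 16-byte case really checks two shadow
// bytes), and the small-access branch mirrors the MOVSX/CMP/JL sequence.
bool shadowCheckFails(std::uintptr_t Addr, unsigned AccessSize) {
  std::int8_t Shadow =
      *reinterpret_cast<const std::int8_t *>((Addr >> 3) + kShadowOffset);
  if (Shadow == 0)
    return false;  // the whole 8-byte granule is addressable
  if (AccessSize < 8)
    return static_cast<std::int8_t>((Addr & 7) + AccessSize - 1) >= Shadow;
  return true;     // larger accesses require a zero shadow value
}
```

The emitted code performs the small-access comparison on registers (MOVSX, CMP, JL) and spills AddressReg, ShadowReg, ScratchReg and EFLAGS around the check, which the sketch glosses over.
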
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp deleted file mode 100644 index 2c376fd062ca..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ /dev/null @@ -1,1089 +0,0 @@ -//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "X86AsmInstrumentation.h" -#include "MCTargetDesc/X86MCTargetDesc.h" -#include "X86Operand.h" -#include "llvm/ADT/Triple.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCParser/MCTargetAsmParser.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetOptions.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SMLoc.h" -#include <algorithm> -#include <cassert> -#include <cstdint> -#include <limits> -#include <memory> -#include <vector> - -// Following comment describes how assembly instrumentation works. -// Currently we have only AddressSanitizer instrumentation, but we're -// planning to implement MemorySanitizer for inline assembly too. If -// you're not familiar with AddressSanitizer algorithm, please, read -// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm -// -// When inline assembly is parsed by an instance of X86AsmParser, all -// instructions are emitted via EmitInstruction method. That's the -// place where X86AsmInstrumentation analyzes an instruction and -// decides, whether the instruction should be emitted as is or -// instrumentation is required. The latter case happens when an -// instruction reads from or writes to memory. Now instruction opcode -// is explicitly checked, and if an instruction has a memory operand -// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be -// instrumented. There're also exist instructions that modify -// memory but don't have an explicit memory operands, for instance, -// movs. -// -// Let's consider at first 8-byte memory accesses when an instruction -// has an explicit memory operand. In this case we need two registers - -// AddressReg to compute address of a memory cells which are accessed -// and ShadowReg to compute corresponding shadow address. So, we need -// to spill both registers before instrumentation code and restore them -// after instrumentation. Thus, in general, instrumentation code will -// look like this: -// PUSHF # Store flags, otherwise they will be overwritten -// PUSH AddressReg # spill AddressReg -// PUSH ShadowReg # spill ShadowReg -// LEA MemOp, AddressReg # compute address of the memory operand -// MOV AddressReg, ShadowReg -// SHR ShadowReg, 3 -// # ShadowOffset(AddressReg >> 3) contains address of a shadow -// # corresponding to MemOp. 
-// CMP ShadowOffset(ShadowReg), 0 # test shadow value -// JZ .Done # when shadow equals to zero, everything is fine -// MOV AddressReg, RDI -// # Call __asan_report function with AddressReg as an argument -// CALL __asan_report -// .Done: -// POP ShadowReg # Restore ShadowReg -// POP AddressReg # Restore AddressReg -// POPF # Restore flags -// -// Memory accesses with different size (1-, 2-, 4- and 16-byte) are -// handled in a similar manner, but small memory accesses (less than 8 -// byte) require an additional ScratchReg, which is used for shadow value. -// -// If, suppose, we're instrumenting an instruction like movs, only -// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize * -// RCX are checked. In this case there're no need to spill and restore -// AddressReg , ShadowReg or flags four times, they're saved on stack -// just once, before instrumentation of these four addresses, and restored -// at the end of the instrumentation. -// -// There exist several things which complicate this simple algorithm. -// * Instrumented memory operand can have RSP as a base or an index -// register. So we need to add a constant offset before computation -// of memory address, since flags, AddressReg, ShadowReg, etc. were -// already stored on stack and RSP was modified. -// * Debug info (usually, DWARF) should be adjusted, because sometimes -// RSP is used as a frame register. So, we need to select some -// register as a frame register and temprorary override current CFA -// register. - -using namespace llvm; - -static cl::opt<bool> ClAsanInstrumentAssembly( - "asan-instrument-assembly", - cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden, - cl::init(false)); - -static const int64_t MinAllowedDisplacement = - std::numeric_limits<int32_t>::min(); -static const int64_t MaxAllowedDisplacement = - std::numeric_limits<int32_t>::max(); - -static int64_t ApplyDisplacementBounds(int64_t Displacement) { - return std::max(std::min(MaxAllowedDisplacement, Displacement), - MinAllowedDisplacement); -} - -static void CheckDisplacementBounds(int64_t Displacement) { - assert(Displacement >= MinAllowedDisplacement && - Displacement <= MaxAllowedDisplacement); -} - -static bool IsStackReg(unsigned Reg) { - return Reg == X86::RSP || Reg == X86::ESP; -} - -static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } - -namespace { - -class X86AddressSanitizer : public X86AsmInstrumentation { -public: - struct RegisterContext { - private: - enum RegOffset { - REG_OFFSET_ADDRESS = 0, - REG_OFFSET_SHADOW, - REG_OFFSET_SCRATCH - }; - - public: - RegisterContext(unsigned AddressReg, unsigned ShadowReg, - unsigned ScratchReg) { - BusyRegs.push_back(convReg(AddressReg, 64)); - BusyRegs.push_back(convReg(ShadowReg, 64)); - BusyRegs.push_back(convReg(ScratchReg, 64)); - } - - unsigned AddressReg(unsigned Size) const { - return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); - } - - unsigned ShadowReg(unsigned Size) const { - return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); - } - - unsigned ScratchReg(unsigned Size) const { - return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); - } - - void AddBusyReg(unsigned Reg) { - if (Reg != X86::NoRegister) - BusyRegs.push_back(convReg(Reg, 64)); - } - - void AddBusyRegs(const X86Operand &Op) { - AddBusyReg(Op.getMemBaseReg()); - AddBusyReg(Op.getMemIndexReg()); - } - - unsigned ChooseFrameReg(unsigned Size) const { - static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, - X86::RCX, X86::RDX, X86::RDI, - X86::RSI }; - for 
(unsigned Reg : Candidates) { - if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) - return convReg(Reg, Size); - } - return X86::NoRegister; - } - - private: - unsigned convReg(unsigned Reg, unsigned Size) const { - return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); - } - - std::vector<unsigned> BusyRegs; - }; - - X86AddressSanitizer(const MCSubtargetInfo *&STI) - : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} - - ~X86AddressSanitizer() override = default; - - // X86AsmInstrumentation implementation: - void InstrumentAndEmitInstruction(const MCInst &Inst, OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, - MCStreamer &Out, - /* unused */ bool) override { - InstrumentMOVS(Inst, Operands, Ctx, MII, Out); - if (RepPrefix) - EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); - - InstrumentMOV(Inst, Operands, Ctx, MII, Out); - - RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX); - if (!RepPrefix) - EmitInstruction(Out, Inst); - } - - // Adjusts up stack and saves all registers used in instrumentation. - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) = 0; - - // Restores all registers used in instrumentation and adjusts stack. - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) = 0; - - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, MCStreamer &Out) = 0; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, MCStreamer &Out) = 0; - - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) = 0; - - void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, - MCStreamer &Out); - void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg, - unsigned AccessSize, MCContext &Ctx, MCStreamer &Out); - - void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); - void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); - -protected: - void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } - - void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { - assert(Size == 32 || Size == 64); - MCInst Inst; - Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - - void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, - unsigned Reg, MCContext &Ctx, MCStreamer &Out); - - // Creates new memory operand with Displacement added to an original - // displacement. Residue will contain a residue which could happen when the - // total displacement exceeds 32-bit limitation. 
- std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op, - int64_t Displacement, - MCContext &Ctx, int64_t *Residue); - - bool is64BitMode() const { - return STI->getFeatureBits()[X86::Mode64Bit]; - } - - bool is32BitMode() const { - return STI->getFeatureBits()[X86::Mode32Bit]; - } - - bool is16BitMode() const { - return STI->getFeatureBits()[X86::Mode16Bit]; - } - - unsigned getPointerWidth() { - if (is16BitMode()) return 16; - if (is32BitMode()) return 32; - if (is64BitMode()) return 64; - llvm_unreachable("invalid mode"); - } - - // True when previous instruction was actually REP prefix. - bool RepPrefix; - - // Offset from the original SP register. - int64_t OrigSPOffset; -}; - -void X86AddressSanitizer::InstrumentMemOperand( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - assert(Op.isMem() && "Op should be a memory operand."); - assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && - "AccessSize should be a power of two, less or equal than 16."); - // FIXME: take into account load/store alignment. - if (IsSmallMemAccess(AccessSize)) - InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); - else - InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); -} - -void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, - unsigned CntReg, - unsigned AccessSize, - MCContext &Ctx, MCStreamer &Out) { - // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)] - // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)]. - RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */, - IsSmallMemAccess(AccessSize) - ? X86::RBX - : X86::NoRegister /* ScratchReg */); - RegCtx.AddBusyReg(DstReg); - RegCtx.AddBusyReg(SrcReg); - RegCtx.AddBusyReg(CntReg); - - InstrumentMemOperandPrologue(RegCtx, Ctx, Out); - - // Test (%SrcReg) - { - const MCExpr *Disp = MCConstantExpr::create(0, Ctx); - std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); - InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, - Out); - } - - // Test -1(%SrcReg, %CntReg, AccessSize) - { - const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); - std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), - SMLoc())); - InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, - Out); - } - - // Test (%DstReg) - { - const MCExpr *Disp = MCConstantExpr::create(0, Ctx); - std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); - InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); - } - - // Test -1(%DstReg, %CntReg, AccessSize) - { - const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); - std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( - getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), - SMLoc())); - InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); - } - - InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); -} - -void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, const MCInstrInfo &MII, - MCStreamer &Out) { - // Access size in bytes. 
- unsigned AccessSize = 0; - - switch (Inst.getOpcode()) { - case X86::MOVSB: - AccessSize = 1; - break; - case X86::MOVSW: - AccessSize = 2; - break; - case X86::MOVSL: - AccessSize = 4; - break; - case X86::MOVSQ: - AccessSize = 8; - break; - default: - return; - } - - InstrumentMOVSImpl(AccessSize, Ctx, Out); -} - -void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, - OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) { - // Access size in bytes. - unsigned AccessSize = 0; - - switch (Inst.getOpcode()) { - case X86::MOV8mi: - case X86::MOV8mr: - case X86::MOV8rm: - AccessSize = 1; - break; - case X86::MOV16mi: - case X86::MOV16mr: - case X86::MOV16rm: - AccessSize = 2; - break; - case X86::MOV32mi: - case X86::MOV32mr: - case X86::MOV32rm: - AccessSize = 4; - break; - case X86::MOV64mi32: - case X86::MOV64mr: - case X86::MOV64rm: - AccessSize = 8; - break; - case X86::MOVAPDmr: - case X86::MOVAPSmr: - case X86::MOVAPDrm: - case X86::MOVAPSrm: - AccessSize = 16; - break; - default: - return; - } - - const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); - - for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { - assert(Operands[Ix]); - MCParsedAsmOperand &Op = *Operands[Ix]; - if (Op.isMem()) { - X86Operand &MemOp = static_cast<X86Operand &>(Op); - RegisterContext RegCtx( - X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */, - IsSmallMemAccess(AccessSize) ? X86::RCX - : X86::NoRegister /* ScratchReg */); - RegCtx.AddBusyRegs(MemOp); - InstrumentMemOperandPrologue(RegCtx, Ctx, Out); - InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out); - InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); - } - } -} - -void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, - unsigned Size, - unsigned Reg, MCContext &Ctx, - MCStreamer &Out) { - int64_t Displacement = 0; - if (IsStackReg(Op.getMemBaseReg())) - Displacement -= OrigSPOffset; - if (IsStackReg(Op.getMemIndexReg())) - Displacement -= OrigSPOffset * Op.getMemScale(); - - assert(Displacement >= 0); - - // Emit Op as is. 
- if (Displacement == 0) { - EmitLEA(Op, Size, Reg, Out); - return; - } - - int64_t Residue; - std::unique_ptr<X86Operand> NewOp = - AddDisplacement(Op, Displacement, Ctx, &Residue); - EmitLEA(*NewOp, Size, Reg, Out); - - while (Residue != 0) { - const MCConstantExpr *Disp = - MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx); - std::unique_ptr<X86Operand> DispOp = - X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), - SMLoc()); - EmitLEA(*DispOp, Size, Reg, Out); - Residue -= Disp->getValue(); - } -} - -std::unique_ptr<X86Operand> -X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, - MCContext &Ctx, int64_t *Residue) { - assert(Displacement >= 0); - - if (Displacement == 0 || - (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { - *Residue = Displacement; - return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), - Op.getMemDisp(), Op.getMemBaseReg(), - Op.getMemIndexReg(), Op.getMemScale(), - SMLoc(), SMLoc()); - } - - int64_t OrigDisplacement = - static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue(); - CheckDisplacementBounds(OrigDisplacement); - Displacement += OrigDisplacement; - - int64_t NewDisplacement = ApplyDisplacementBounds(Displacement); - CheckDisplacementBounds(NewDisplacement); - - *Residue = Displacement - NewDisplacement; - const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx); - return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp, - Op.getMemBaseReg(), Op.getMemIndexReg(), - Op.getMemScale(), SMLoc(), SMLoc()); -} - -class X86AddressSanitizer32 : public X86AddressSanitizer { -public: - static const long kShadowOffset = 0x20000000; - - X86AddressSanitizer32(const MCSubtargetInfo *&STI) - : X86AddressSanitizer(STI) {} - - ~X86AddressSanitizer32() override = default; - - unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { - unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); - if (FrameReg == X86::NoRegister) - return FrameReg; - return getX86SubSuperRegister(FrameReg, 32); - } - - void SpillReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg)); - OrigSPOffset -= 4; - } - - void RestoreReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg)); - OrigSPOffset += 4; - } - - void StoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); - OrigSPOffset -= 4; - } - - void RestoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::POPF32)); - OrigSPOffset += 4; - } - - void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); - assert(LocalFrameReg != X86::NoRegister); - - const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (MRI && FrameReg != X86::NoRegister) { - SpillReg(Out, LocalFrameReg); - if (FrameReg == X86::ESP) { - Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */); - Out.EmitCFIRelOffset( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); - } - EmitInstruction( - Out, - MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg)); - Out.EmitCFIRememberState(); - Out.EmitCFIDefCfaRegister( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); - } - - SpillReg(Out, RegCtx.AddressReg(32)); - SpillReg(Out, RegCtx.ShadowReg(32)); - if (RegCtx.ScratchReg(32) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(32)); - 
StoreFlags(Out); - } - - void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); - assert(LocalFrameReg != X86::NoRegister); - - RestoreFlags(Out); - if (RegCtx.ScratchReg(32) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(32)); - RestoreReg(Out, RegCtx.ShadowReg(32)); - RestoreReg(Out, RegCtx.AddressReg(32)); - - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { - RestoreReg(Out, LocalFrameReg); - Out.EmitCFIRestoreState(); - if (FrameReg == X86::ESP) - Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */); - } - } - - void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; - -private: - void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out, const RegisterContext &RegCtx) { - EmitInstruction(Out, MCInstBuilder(X86::CLD)); - EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - - EmitInstruction(Out, MCInstBuilder(X86::AND32ri8) - .addReg(X86::ESP) - .addReg(X86::ESP) - .addImm(-16)); - EmitInstruction( - Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - - MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") + - (IsWrite ? "store" : "load") + - Twine(AccessSize)); - const MCSymbolRefExpr *FnExpr = - MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); - } -}; - -void X86AddressSanitizer32::InstrumentMemOperandSmall( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - - assert(RegCtx.ScratchReg(32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - - ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) - .addReg(ShadowRegI32) - .addReg(ShadowRegI32) - .addImm(3)); - - { - MCInst Inst; - Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::createReg(ShadowRegI8)); - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - - EmitInstruction( - Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(7)); - - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 1: - break; - case 
2: { - const MCExpr *Disp = MCConstantExpr::create(1, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, - SMLoc(), SMLoc())); - EmitLEA(*Op, 32, ScratchRegI32, Out); - break; - } - case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(3)); - break; - } - - EmitInstruction( - Out, - MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); - EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( - ShadowRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer32::InstrumentMemOperandLarge( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - - ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) - .addReg(ShadowRegI32) - .addReg(ShadowRegI32) - .addImm(3)); - { - MCInst Inst; - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 8: - Inst.setOpcode(X86::CMP8mi); - break; - case 16: - Inst.setOpcode(X86::CMP16mi); - break; - } - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - Inst.addOperand(MCOperand::createImm(0)); - EmitInstruction(Out, Inst); - } - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, - MCContext &Ctx, - MCStreamer &Out) { - StoreFlags(Out); - - // No need to test when ECX is equals to zero. - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction( - Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - // Instrument first and last elements in src and dst range. 
- InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, - X86::ECX /* CntReg */, AccessSize, Ctx, Out); - - EmitLabel(Out, DoneSym); - RestoreFlags(Out); -} - -class X86AddressSanitizer64 : public X86AddressSanitizer { -public: - static const long kShadowOffset = 0x7fff8000; - - X86AddressSanitizer64(const MCSubtargetInfo *&STI) - : X86AddressSanitizer(STI) {} - - ~X86AddressSanitizer64() override = default; - - unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { - unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); - if (FrameReg == X86::NoRegister) - return FrameReg; - return getX86SubSuperRegister(FrameReg, 64); - } - - void SpillReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg)); - OrigSPOffset -= 8; - } - - void RestoreReg(MCStreamer &Out, unsigned Reg) { - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg)); - OrigSPOffset += 8; - } - - void StoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - OrigSPOffset -= 8; - } - - void RestoreFlags(MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::POPF64)); - OrigSPOffset += 8; - } - - void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); - assert(LocalFrameReg != X86::NoRegister); - - const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (MRI && FrameReg != X86::NoRegister) { - SpillReg(Out, X86::RBP); - if (FrameReg == X86::RSP) { - Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */); - Out.EmitCFIRelOffset( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); - } - EmitInstruction( - Out, - MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg)); - Out.EmitCFIRememberState(); - Out.EmitCFIDefCfaRegister( - MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); - } - - EmitAdjustRSP(Ctx, Out, -128); - SpillReg(Out, RegCtx.ShadowReg(64)); - SpillReg(Out, RegCtx.AddressReg(64)); - if (RegCtx.ScratchReg(64) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(64)); - StoreFlags(Out); - } - - void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); - assert(LocalFrameReg != X86::NoRegister); - - RestoreFlags(Out); - if (RegCtx.ScratchReg(64) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(64)); - RestoreReg(Out, RegCtx.AddressReg(64)); - RestoreReg(Out, RegCtx.ShadowReg(64)); - EmitAdjustRSP(Ctx, Out, 128); - - unsigned FrameReg = GetFrameReg(Ctx, Out); - if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { - RestoreReg(Out, LocalFrameReg); - Out.EmitCFIRestoreState(); - if (FrameReg == X86::RSP) - Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */); - } - } - - void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; - -private: - void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { - const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, 
X86::RSP, 0, 1, - SMLoc(), SMLoc())); - EmitLEA(*Op, 64, X86::RSP, Out); - OrigSPOffset += Offset; - } - - void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out, const RegisterContext &RegCtx) { - EmitInstruction(Out, MCInstBuilder(X86::CLD)); - EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) - .addReg(X86::RSP) - .addReg(X86::RSP) - .addImm(-16)); - - if (RegCtx.AddressReg(64) != X86::RDI) { - EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( - RegCtx.AddressReg(64))); - } - MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") + - (IsWrite ? "store" : "load") + - Twine(AccessSize)); - const MCSymbolRefExpr *FnExpr = - MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); - } -}; - -} // end anonymous namespace - -void X86AddressSanitizer64::InstrumentMemOperandSmall( - X86Operand &Op, unsigned AccessSize, bool IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(64); - unsigned AddressRegI32 = RegCtx.AddressReg(32); - unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - - assert(RegCtx.ScratchReg(32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - - ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( - AddressRegI64)); - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) - .addReg(ShadowRegI64) - .addReg(ShadowRegI64) - .addImm(3)); - { - MCInst Inst; - Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::createReg(ShadowRegI8)); - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - - EmitInstruction( - Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( - AddressRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(7)); - - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 1: - break; - case 2: { - const MCExpr *Disp = MCConstantExpr::create(1, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, - SMLoc(), SMLoc())); - EmitLEA(*Op, 32, ScratchRegI32, Out); - break; - } - case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) - .addReg(ScratchRegI32) - .addReg(ScratchRegI32) - .addImm(3)); - break; - } - - EmitInstruction( - Out, - MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); - EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( - ShadowRegI32)); - EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer64::InstrumentMemOperandLarge( - X86Operand &Op, unsigned AccessSize, bool 
IsWrite, - const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(64); - unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - - ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); - - EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( - AddressRegI64)); - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) - .addReg(ShadowRegI64) - .addReg(ShadowRegI64) - .addImm(3)); - { - MCInst Inst; - switch (AccessSize) { - default: llvm_unreachable("Incorrect access size"); - case 8: - Inst.setOpcode(X86::CMP8mi); - break; - case 16: - Inst.setOpcode(X86::CMP16mi); - break; - } - const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); - std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, - SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - Inst.addOperand(MCOperand::createImm(0)); - EmitInstruction(Out, Inst); - } - - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); - EmitLabel(Out, DoneSym); -} - -void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, - MCContext &Ctx, - MCStreamer &Out) { - StoreFlags(Out); - - // No need to test when RCX is equals to zero. - MCSymbol *DoneSym = Ctx.createTempSymbol(); - const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); - EmitInstruction( - Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - - // Instrument first and last elements in src and dst range. - InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, - X86::RCX /* CntReg */, AccessSize, Ctx, Out); - - EmitLabel(Out, DoneSym); - RestoreFlags(Out); -} - -X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) - : STI(STI) {} - -X86AsmInstrumentation::~X86AsmInstrumentation() = default; - -void X86AsmInstrumentation::InstrumentAndEmitInstruction( - const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out, bool PrintSchedInfoEnabled) { - EmitInstruction(Out, Inst, PrintSchedInfoEnabled); -} - -void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst, - bool PrintSchedInfoEnabled) { - Out.EmitInstruction(Inst, *STI, PrintSchedInfoEnabled); -} - -unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, - MCStreamer &Out) { - if (!Out.getNumFrameInfos()) // No active dwarf frame - return X86::NoRegister; - const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back(); - if (Frame.End) // Active dwarf frame is closed - return X86::NoRegister; - const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); - if (!MRI) // No register info - return X86::NoRegister; - - if (InitialFrameReg) { - // FrameReg is set explicitly, we're instrumenting a MachineFunction. 
- return InitialFrameReg; - } - - return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */); -} - -X86AsmInstrumentation * -llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, - const MCSubtargetInfo *&STI) { - Triple T(STI->getTargetTriple()); - const bool hasCompilerRTSupport = T.isOSLinux(); - if (ClAsanInstrumentAssembly && hasCompilerRTSupport && - MCOptions.SanitizeAddress) { - if (STI->getFeatureBits()[X86::Mode32Bit] != 0) - return new X86AddressSanitizer32(STI); - if (STI->getFeatureBits()[X86::Mode64Bit] != 0) - return new X86AddressSanitizer64(STI); - } - return new X86AsmInstrumentation(STI); -} diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h deleted file mode 100644 index 42a9dc3ba26a..000000000000 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ /dev/null @@ -1,68 +0,0 @@ -//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H -#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H - -#include "llvm/ADT/SmallVector.h" -#include <memory> - -namespace llvm { - -class MCContext; -class MCInst; -class MCInstrInfo; -class MCParsedAsmOperand; -class MCStreamer; -class MCSubtargetInfo; -class MCTargetOptions; -class X86AsmInstrumentation; - -X86AsmInstrumentation * -CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, - const MCSubtargetInfo *&STI); - -class X86AsmInstrumentation { -public: - virtual ~X86AsmInstrumentation(); - - // Sets frame register corresponding to a current frame. - void SetInitialFrameRegister(unsigned RegNo) { - InitialFrameReg = RegNo; - } - - // Tries to instrument and emit instruction. - virtual void InstrumentAndEmitInstruction( - const MCInst &Inst, - SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out, - bool PrintSchedInfoEnabled); - -protected: - friend X86AsmInstrumentation * - CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, - const MCSubtargetInfo *&STI); - - X86AsmInstrumentation(const MCSubtargetInfo *&STI); - - unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); - - void EmitInstruction(MCStreamer &Out, const MCInst &Inst, - bool PrintSchedInfoEnabled = false); - - const MCSubtargetInfo *&STI; - - unsigned InitialFrameReg = 0; -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 81391b96d126..95cbf46d37ed 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1,17 +1,16 @@ //===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86IntelInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86IntelInstPrinter.h" #include "MCTargetDesc/X86MCExpr.h" #include "MCTargetDesc/X86TargetStreamer.h" -#include "X86AsmInstrumentation.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" #include "llvm/ADT/STLExtras.h" @@ -71,9 +70,17 @@ static const char OpPrecedence[] = { class X86AsmParser : public MCTargetAsmParser { ParseInstructionInfo *InstInfo; - std::unique_ptr<X86AsmInstrumentation> Instrumentation; bool Code16GCC; + enum VEXEncoding { + VEXEncoding_Default, + VEXEncoding_VEX2, + VEXEncoding_VEX3, + VEXEncoding_EVEX, + }; + + VEXEncoding ForcedVEXEncoding = VEXEncoding_Default; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -90,13 +97,14 @@ private: } unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst, - uint64_t &ErrorInfo, bool matchingInlineAsm, - unsigned VariantID = 0) { + uint64_t &ErrorInfo, FeatureBitset &MissingFeatures, + bool matchingInlineAsm, unsigned VariantID = 0) { // In Code16GCC mode, match as 32-bit. if (Code16GCC) SwitchMode(X86::Mode32Bit); unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo, - matchingInlineAsm, VariantID); + MissingFeatures, matchingInlineAsm, + VariantID); if (Code16GCC) SwitchMode(X86::Mode16Bit); return rv; @@ -840,6 +848,8 @@ private: const SMLoc &StartLoc, SMLoc &EndLoc); + X86::CondCode ParseConditionCode(StringRef CCode); + bool ParseIntelMemoryOperandSize(unsigned &Size); std::unique_ptr<X86Operand> CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, @@ -860,6 +870,8 @@ private: bool parseDirectiveFPOEndProc(SMLoc L); bool parseDirectiveFPOData(SMLoc L); + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); @@ -875,7 +887,7 @@ private: void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands, MCStreamer &Out, bool MatchingInlineAsm); - bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures, bool MatchingInlineAsm); bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -914,7 +926,7 @@ private: MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; - uint64_t FB = ComputeAvailableFeatures( + FeatureBitset FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); @@ -941,6 +953,9 @@ private: /// } public: + enum X86MatchResultTy { + Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY, + }; X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) @@ -951,14 +966,10 @@ public: // Initialize the set of available features. 
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); - Instrumentation.reset( - CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - void SetFrameRegister(unsigned RegNo) override; - bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -1193,10 +1204,6 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } -void X86AsmParser::SetFrameRegister(unsigned RegNo) { - Instrumentation->SetInitialFrameRegister(RegNo); -} - std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { bool Parse32 = is32BitMode() || Code16GCC; unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI); @@ -1655,6 +1662,8 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) { const AsmToken &Tok = Parser.getTok(); // Eat "{" and mark the current place. const SMLoc consumedToken = consumeToken(); + if (Tok.isNot(AsmToken::Identifier)) + return ErrorOperand(Tok.getLoc(), "Expected an identifier after {"); if (Tok.getIdentifier().startswith("r")){ int rndMode = StringSwitch<int>(Tok.getIdentifier()) .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) @@ -1998,6 +2007,29 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { } } +// X86::COND_INVALID if not a recognized condition code or alternate mnemonic, +// otherwise the EFLAGS Condition Code enumerator. +X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) { + return StringSwitch<X86::CondCode>(CC) + .Case("o", X86::COND_O) // Overflow + .Case("no", X86::COND_NO) // No Overflow + .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal + .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below + .Cases("e", "z", X86::COND_E) // Equal/Zero + .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero + .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above + .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal + .Case("s", X86::COND_S) // Sign + .Case("ns", X86::COND_NS) // No Sign + .Cases("p", "pe", X86::COND_P) // Parity/Parity Even + .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd + .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal + .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less + .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater + .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal + .Default(X86::COND_INVALID); +} + // true on failure, false otherwise // If no {z} mark was found - Parser doesn't advance bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z, @@ -2304,18 +2336,64 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); InstInfo = &Info; + + // Reset the forced VEX encoding. + ForcedVEXEncoding = VEXEncoding_Default; + + // Parse pseudo prefixes. + while (1) { + if (Name == "{") { + if (getLexer().isNot(AsmToken::Identifier)) + return Error(Parser.getTok().getLoc(), "Unexpected token after '{'"); + std::string Prefix = Parser.getTok().getString().lower(); + Parser.Lex(); // Eat identifier. + if (getLexer().isNot(AsmToken::RCurly)) + return Error(Parser.getTok().getLoc(), "Expected '}'"); + Parser.Lex(); // Eat curly. 
+ + if (Prefix == "vex2") + ForcedVEXEncoding = VEXEncoding_VEX2; + else if (Prefix == "vex3") + ForcedVEXEncoding = VEXEncoding_VEX3; + else if (Prefix == "evex") + ForcedVEXEncoding = VEXEncoding_EVEX; + else + return Error(NameLoc, "unknown prefix"); + + NameLoc = Parser.getTok().getLoc(); + if (getLexer().is(AsmToken::LCurly)) { + Parser.Lex(); + Name = "{"; + } else { + if (getLexer().isNot(AsmToken::Identifier)) + return Error(Parser.getTok().getLoc(), "Expected identifier"); + // FIXME: The mnemonic won't match correctly if its not in lower case. + Name = Parser.getTok().getString(); + Parser.Lex(); + } + continue; + } + + break; + } + StringRef PatchedName = Name; - if ((Name.equals("jmp") || Name.equals("jc") || Name.equals("jz")) && - isParsingIntelSyntax() && isParsingInlineAsm()) { + // Hack to skip "short" following Jcc. + if (isParsingIntelSyntax() && + (PatchedName == "jmp" || PatchedName == "jc" || PatchedName == "jnc" || + PatchedName == "jcxz" || PatchedName == "jexcz" || + (PatchedName.startswith("j") && + ParseConditionCode(PatchedName.substr(1)) != X86::COND_INVALID))) { StringRef NextTok = Parser.getTok().getString(); if (NextTok == "short") { SMLoc NameEndLoc = NameLoc.getFromPointer(NameLoc.getPointer() + Name.size()); - // Eat the short keyword + // Eat the short keyword. Parser.Lex(); - // MS ignores the short keyword, it determines the jmp type based - // on the distance of the label + // MS and GAS ignore the short keyword; they both determine the jmp type + // based on the distance of the label. (NASM does emit different code with + // and without "short," though.) InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc, NextTok.size() + 1); } @@ -2326,13 +2404,15 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName != "setb" && PatchedName != "setnb") PatchedName = PatchedName.substr(0, Name.size()-1); + unsigned ComparisonPredicate = ~0U; + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; unsigned CCIdx = IsVCMP ? 4 : 3; - unsigned ComparisonCode = StringSwitch<unsigned>( + unsigned CC = StringSwitch<unsigned>( PatchedName.slice(CCIdx, PatchedName.size() - 2)) .Case("eq", 0x00) .Case("eq_oq", 0x00) @@ -2382,26 +2462,29 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) { - - Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx), - NameLoc)); - - const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, - getParser().getContext()); - Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + if (CC != ~0U && (IsVCMP || CC < 8)) { + if (PatchedName.endswith("ss")) + PatchedName = IsVCMP ? "vcmpss" : "cmpss"; + else if (PatchedName.endswith("sd")) + PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; + else if (PatchedName.endswith("ps")) + PatchedName = IsVCMP ? "vcmpps" : "cmpps"; + else if (PatchedName.endswith("pd")) + PatchedName = IsVCMP ? "vcmppd" : "cmppd"; + else + llvm_unreachable("Unexpected suffix!"); - PatchedName = PatchedName.substr(PatchedName.size() - 2); + ComparisonPredicate = CC; } } // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}. 
if (PatchedName.startswith("vpcmp") && - (PatchedName.endswith("b") || PatchedName.endswith("w") || - PatchedName.endswith("d") || PatchedName.endswith("q"))) { - unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; - unsigned ComparisonCode = StringSwitch<unsigned>( - PatchedName.slice(5, PatchedName.size() - CCIdx)) + (PatchedName.back() == 'b' || PatchedName.back() == 'w' || + PatchedName.back() == 'd' || PatchedName.back() == 'q')) { + unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned CC = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - SuffixSize)) .Case("eq", 0x0) // Only allowed on unsigned. Checked below. .Case("lt", 0x1) .Case("le", 0x2) @@ -2411,24 +2494,26 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("nle", 0x6) //.Case("true", 0x7) // Not a documented alias. .Default(~0U); - if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) { - Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc)); - - const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, - getParser().getContext()); - Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); - - PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + if (CC != ~0U && (CC != 0 || SuffixSize == 2)) { + switch (PatchedName.back()) { + default: llvm_unreachable("Unexpected character!"); + case 'b': PatchedName = SuffixSize == 2 ? "vpcmpub" : "vpcmpb"; break; + case 'w': PatchedName = SuffixSize == 2 ? "vpcmpuw" : "vpcmpw"; break; + case 'd': PatchedName = SuffixSize == 2 ? "vpcmpud" : "vpcmpd"; break; + case 'q': PatchedName = SuffixSize == 2 ? "vpcmpuq" : "vpcmpq"; break; + } + // Set up the immediate to push into the operands later. + ComparisonPredicate = CC; } } // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}. if (PatchedName.startswith("vpcom") && - (PatchedName.endswith("b") || PatchedName.endswith("w") || - PatchedName.endswith("d") || PatchedName.endswith("q"))) { - unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; - unsigned ComparisonCode = StringSwitch<unsigned>( - PatchedName.slice(5, PatchedName.size() - CCIdx)) + (PatchedName.back() == 'b' || PatchedName.back() == 'w' || + PatchedName.back() == 'd' || PatchedName.back() == 'q')) { + unsigned SuffixSize = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned CC = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - SuffixSize)) .Case("lt", 0x0) .Case("le", 0x1) .Case("gt", 0x2) @@ -2438,14 +2523,16 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("false", 0x6) .Case("true", 0x7) .Default(~0U); - if (ComparisonCode != ~0U) { - Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc)); - - const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, - getParser().getContext()); - Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); - - PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + if (CC != ~0U) { + switch (PatchedName.back()) { + default: llvm_unreachable("Unexpected character!"); + case 'b': PatchedName = SuffixSize == 2 ? "vpcomub" : "vpcomb"; break; + case 'w': PatchedName = SuffixSize == 2 ? "vpcomuw" : "vpcomw"; break; + case 'd': PatchedName = SuffixSize == 2 ? "vpcomud" : "vpcomd"; break; + case 'q': PatchedName = SuffixSize == 2 ? "vpcomuq" : "vpcomq"; break; + } + // Set up the immediate to push into the operands later. 
+ ComparisonPredicate = CC; } } @@ -2488,6 +2575,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Flags = X86::IP_NO_PREFIX; break; } + // FIXME: The mnemonic won't match correctly if its not in lower case. Name = Parser.getTok().getString(); Parser.Lex(); // eat the prefix // Hack: we could have something like "rep # some comment" or @@ -2495,6 +2583,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, while (Name.startswith(";") || Name.startswith("\n") || Name.startswith("#") || Name.startswith("\t") || Name.startswith("/")) { + // FIXME: The mnemonic won't match correctly if its not in lower case. Name = Parser.getTok().getString(); Parser.Lex(); // go to next prefix or instr } @@ -2518,6 +2607,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + // Push the immediate if we extracted one from the mnemonic. + if (ComparisonPredicate != ~0U && !isParsingIntelSyntax()) { + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + } + // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as @@ -2552,6 +2648,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return TokError("unexpected token in argument list"); } + // Push the immediate if we extracted one from the mnemonic. + if (ComparisonPredicate != ~0U && isParsingIntelSyntax()) { + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonPredicate, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + } + // Consume the EndOfStatement or the prefix separator Slash if (getLexer().is(AsmToken::EndOfStatement) || (isPrefix && getLexer().is(AsmToken::Slash))) @@ -2575,13 +2678,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); } - // Moving a 32 or 16 bit value into a segment register has the same - // behavior. Modify such instructions to always take shorter form. if ((Name == "mov" || Name == "movw" || Name == "movl") && (Operands.size() == 3)) { X86Operand &Op1 = (X86Operand &)*Operands[1]; X86Operand &Op2 = (X86Operand &)*Operands[2]; SMLoc Loc = Op1.getEndLoc(); + // Moving a 32 or 16 bit value into a segment register has the same + // behavior. Modify such instructions to always take shorter form. 
if (Op1.isReg() && Op2.isReg() && X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains( Op2.getReg()) && @@ -2758,7 +2861,69 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { - return false; + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + + switch (Inst.getOpcode()) { + default: return false; + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + // We can get a smaller encoding by using VEX.R instead of VEX.B if one of + // the registers is extended, but other isn't. + if (ForcedVEXEncoding == VEXEncoding_VEX3 || + MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 || + MRI->getEncodingValue(Inst.getOperand(1).getReg()) < 8) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + // We can get a smaller encoding by using VEX.R instead of VEX.B if one of + // the registers is extended, but other isn't. 
+ if (ForcedVEXEncoding == VEXEncoding_VEX3 || + MRI->getEncodingValue(Inst.getOperand(0).getReg()) >= 8 || + MRI->getEncodingValue(Inst.getOperand(2).getReg()) < 8) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + } } bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { @@ -2864,9 +3029,7 @@ static const char *getSubtargetFeatureName(uint64_t Val); void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { - Instrumentation->InstrumentAndEmitInstruction( - Inst, Operands, getContext(), MII, Out, - getParser().shouldPrintSchedInfo()); + Out.EmitInstruction(Inst, getSTI()); } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -2906,17 +3069,16 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, } } -bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, +bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, + const FeatureBitset &MissingFeatures, bool MatchingInlineAsm) { - assert(ErrorInfo && "Unknown missing feature!"); + assert(MissingFeatures.any() && "Unknown missing feature!"); SmallString<126> Msg; raw_svector_ostream OS(Msg); OS << "instruction requires:"; - uint64_t Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) - OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); - Mask <<= 1; + for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) { + if (MissingFeatures[i]) + OS << ' ' << getSubtargetFeatureName(i); } return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm); } @@ -2931,30 +3093,70 @@ static unsigned getPrefixes(OperandVector &Operands) { return Result; } +unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { + unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &MCID = MII.get(Opc); + + if (ForcedVEXEncoding == VEXEncoding_EVEX && + (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX) + return Match_Unsupported; + + if ((ForcedVEXEncoding == VEXEncoding_VEX2 || + ForcedVEXEncoding == VEXEncoding_VEX3) && + (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX) + return Match_Unsupported; + + // These instructions match ambiguously with their VEX encoded counterparts + // and appear first in the matching table. Reject them unless we're forcing + // EVEX encoding. + // FIXME: We really need a way to break the ambiguity. 
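A standalone sketch of the encoding filter that checkTargetMatchPredicate now applies (the enum names are invented for the example; the real check reads X86II::EncodingMask out of TSFlags): a mnemonic written with an {evex} prefix may only match an EVEX-encoded form, and {vex2}/{vex3} only a VEX-encoded one; everything else is reported as unsupported so the matcher can try another table entry or emit a diagnostic.

// Standalone sketch of the forced-encoding match predicate.
#include <iostream>

enum class Encoding { Legacy, VEX, EVEX };
enum class ForcedEncoding { None, VEX2, VEX3, EVEX };
enum class MatchResult { Success, Unsupported };

static MatchResult checkEncodingPredicate(ForcedEncoding Forced, Encoding E) {
  if (Forced == ForcedEncoding::EVEX && E != Encoding::EVEX)
    return MatchResult::Unsupported;
  if ((Forced == ForcedEncoding::VEX2 || Forced == ForcedEncoding::VEX3) &&
      E != Encoding::VEX)
    return MatchResult::Unsupported;
  return MatchResult::Success;
}

int main() {
  // "{evex} vmovaps ..." must not fall back to the VEX-encoded form.
  bool Rejected = checkEncodingPredicate(ForcedEncoding::EVEX, Encoding::VEX) ==
                  MatchResult::Unsupported;
  std::cout << Rejected << '\n'; // 1
}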
+ switch (Opc) { + case X86::VCVTSD2SIZrm_Int: + case X86::VCVTSD2SI64Zrm_Int: + case X86::VCVTSS2SIZrm_Int: + case X86::VCVTSS2SI64Zrm_Int: + case X86::VCVTTSD2SIZrm: case X86::VCVTTSD2SIZrm_Int: + case X86::VCVTTSD2SI64Zrm: case X86::VCVTTSD2SI64Zrm_Int: + case X86::VCVTTSS2SIZrm: case X86::VCVTTSS2SIZrm_Int: + case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int: + if (ForcedVEXEncoding != VEXEncoding_EVEX) + return Match_Unsupported; + } + + return Match_Success; +} + bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); SMRange EmptyRange = None; // First, handle aliases that expand to multiple instructions. - MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); - - bool WasOriginallyInvalidOperand = false; + MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, + Out, MatchingInlineAsm); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); unsigned Prefixes = getPrefixes(Operands); MCInst Inst; + // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the + // encoder. + if (ForcedVEXEncoding == VEXEncoding_VEX3) + Prefixes |= X86::IP_USE_VEX3; + if (Prefixes) Inst.setFlags(Prefixes); // First, try a direct match. - switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm, - isParsingIntelSyntax())) { + FeatureBitset MissingFeatures; + unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo, + MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); + switch (OriginalError) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) @@ -2972,13 +3174,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, Opcode = Inst.getOpcode(); return false; case Match_MissingFeature: - return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm); + return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm); case Match_InvalidOperand: - WasOriginallyInvalidOperand = true; - break; case Match_MnemonicFail: + case Match_Unsupported: break; } + if (Op.getToken().empty()) { + Error(IDLoc, "instruction must have size higher than 0", EmptyRange, + MatchingInlineAsm); + return true; + } // FIXME: Ideally, we would only attempt suffix matches for things which are // valid prefixes, and we could just infer the right unambiguous @@ -3002,16 +3208,17 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // Check for the various suffix matches. uint64_t ErrorInfoIgnore; - uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings. unsigned Match[4]; for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { Tmp.back() = Suffixes[I]; Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); + MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. 
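For illustration, a standalone sketch of the new missing-feature reporting (std::bitset and the literal feature names stand in for LLVM's FeatureBitset and getSubtargetFeatureName(); they are not the real tables): rather than decoding a single 64-bit mask out of ErrorInfo, the matcher now hands back a dedicated bitset and the diagnostic simply names every set bit.

// Standalone sketch of building the "instruction requires:" message from a
// feature bitset instead of a 64-bit mask.
#include <bitset>
#include <cstddef>
#include <iostream>
#include <string>

constexpr std::size_t NumFeatures = 4;
static const char *FeatureNames[NumFeatures] = {"AVX", "AVX2", "AVX512F",
                                                "AVX512VL"};

static std::string missingFeatureMessage(const std::bitset<NumFeatures> &Missing) {
  std::string Msg = "instruction requires:";
  for (std::size_t I = 0; I != Missing.size(); ++I)
    if (Missing[I]) {
      Msg += ' ';
      Msg += FeatureNames[I];
    }
  return Msg;
}

int main() {
  std::bitset<NumFeatures> Missing;
  Missing.set(2); // AVX512F
  Missing.set(3); // AVX512VL
  std::cout << missingFeatureMessage(Missing) << '\n';
  // Prints: instruction requires: AVX512F AVX512VL
}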
if (Match[I] == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; + ErrorInfoMissingFeatures = MissingFeatures; } // Restore the old token. @@ -3061,11 +3268,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If all of the instructions reported an invalid mnemonic, then the original // mnemonic was invalid. if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { - if (!WasOriginallyInvalidOperand) { + if (OriginalError == Match_MnemonicFail) return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Op.getLocRange(), MatchingInlineAsm); - } + if (OriginalError == Match_Unsupported) + return Error(IDLoc, "unsupported instruction", EmptyRange, + MatchingInlineAsm); + + assert(OriginalError == Match_InvalidOperand && "Unexpected error"); // Recover location info for the operand if we know which was the problem. if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) @@ -3084,12 +3295,19 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, MatchingInlineAsm); } + // If one instruction matched as unsupported, report this as unsupported. + if (std::count(std::begin(Match), std::end(Match), + Match_Unsupported) == 1) { + return Error(IDLoc, "unsupported instruction", EmptyRange, + MatchingInlineAsm); + } + // If one instruction matched with a missing feature, report this as a // missing feature. if (std::count(std::begin(Match), std::end(Match), Match_MissingFeature) == 1) { - ErrorInfo = ErrorInfoMissingFeature; - return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + ErrorInfo = Match_MissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); } @@ -3113,18 +3331,23 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); - StringRef Mnemonic = Op.getToken(); + assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); + StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken(); SMRange EmptyRange = None; - StringRef Base = Op.getToken(); + StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken(); unsigned Prefixes = getPrefixes(Operands); // First, handle aliases that expand to multiple instructions. - MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, Out, MatchingInlineAsm); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); MCInst Inst; + // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the + // encoder. + if (ForcedVEXEncoding == VEXEncoding_VEX3) + Prefixes |= X86::IP_USE_VEX3; + if (Prefixes) Inst.setFlags(Prefixes); @@ -3153,7 +3376,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } SmallVector<unsigned, 8> Match; - uint64_t ErrorInfoMissingFeature = 0; + FeatureBitset ErrorInfoMissingFeatures; + FeatureBitset MissingFeatures; // If unsized push has immediate operand we should default the default pointer // size for the size. @@ -3173,7 +3397,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, Op.setTokenValue(Tmp); // Do match in ATT mode to allow explicit suffix usage. 
Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo, - MatchingInlineAsm, + MissingFeatures, MatchingInlineAsm, false /*isParsingIntelSyntax()*/)); Op.setTokenValue(Base); } @@ -3190,13 +3414,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t ErrorInfoIgnore; unsigned LastOpcode = Inst.getOpcode(); unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); + MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); if (Match.empty() || LastOpcode != Inst.getOpcode()) Match.push_back(M); // If this returned as a missing feature failure, remember that. if (Match.back() == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; + ErrorInfoMissingFeatures = MissingFeatures; } // Restore the size of the unsized memory operand if we modified it. @@ -3208,10 +3433,11 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // matching with the unsized operand. if (Match.empty()) { Match.push_back(MatchInstruction( - Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())); + Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax())); // If this returned as a missing feature failure, remember that. if (Match.back() == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfo; + ErrorInfoMissingFeatures = MissingFeatures; } // Restore the size of the unsized memory operand if we modified it. @@ -3233,7 +3459,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, UnsizedMemOp->getMemFrontendSize()) { UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize(); unsigned M = MatchInstruction( - Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()); + Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, + isParsingIntelSyntax()); if (M == Match_Success) NumSuccessfulMatches = 1; @@ -3269,12 +3496,19 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, UnsizedMemOp->getLocRange()); } + // If one instruction matched as unsupported, report this as unsupported. + if (std::count(std::begin(Match), std::end(Match), + Match_Unsupported) == 1) { + return Error(IDLoc, "unsupported instruction", EmptyRange, + MatchingInlineAsm); + } + // If one instruction matched with a missing feature, report this as a // missing feature. if (std::count(std::begin(Match), std::end(Match), Match_MissingFeature) == 1) { - ErrorInfo = ErrorInfoMissingFeature; - return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + ErrorInfo = Match_MissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); } diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index c45a3f14ef11..5bc979d1f18c 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -1,9 +1,8 @@ //===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h index 4d4aae0a1c6a..a771ba366318 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -1,16 +1,15 @@ //===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H #define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H -#include "InstPrinter/X86IntelInstPrinter.h" +#include "MCTargetDesc/X86IntelInstPrinter.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86AsmParserCommon.h" #include "llvm/ADT/STLExtras.h" @@ -452,6 +451,31 @@ struct X86Operand final : public MCParsedAsmOperand { X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); } + bool isVK1Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg()); + } + + bool isVK2Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK2RegClassID].contains(getReg()); + } + + bool isVK4Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK4RegClassID].contains(getReg()); + } + + bool isVK8Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK8RegClassID].contains(getReg()); + } + + bool isVK16Pair() const { + return Kind == Register && + X86MCRegisterClasses[X86::VK16RegClassID].contains(getReg()); + } + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) @@ -483,6 +507,30 @@ struct X86Operand final : public MCParsedAsmOperand { addExpr(Inst, getImm()); } + void addMaskPairOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Reg = getReg(); + switch (Reg) { + case X86::K0: + case X86::K1: + Reg = X86::K0_K1; + break; + case X86::K2: + case X86::K3: + Reg = X86::K2_K3; + break; + case X86::K4: + case X86::K5: + Reg = X86::K4_K5; + break; + case X86::K6: + case X86::K7: + Reg = X86::K6_K7; + break; + } + Inst.addOperand(MCOperand::createReg(Reg)); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMemBaseReg())); diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 62312777318e..9a635bbe5f85 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -1,9 +1,8 @@ //===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -76,6 +75,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86DisassemblerDecoder.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -446,211 +446,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case ENCODING_IO: break; } - } else if (type == TYPE_IMM3) { - // Check for immediates that printSSECC can't handle. - if (immediate >= 8) { - unsigned NewOpc; - switch (mcInst.getOpcode()) { - default: llvm_unreachable("unexpected opcode"); - case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; - case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; - case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; - case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; - case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; - case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; - case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; - case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; - case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break; - case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break; - case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break; - case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break; - case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break; - case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break; - case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break; - case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break; - case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break; - case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break; - case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break; - case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break; - case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break; - case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break; - case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break; - case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break; - } - // Switch opcode to the one that doesn't get special printing. - mcInst.setOpcode(NewOpc); - } - } else if (type == TYPE_IMM5) { - // Check for immediates that printAVXCC can't handle. 
- if (immediate >= 32) { - unsigned NewOpc; - switch (mcInst.getOpcode()) { - default: llvm_unreachable("unexpected opcode"); - case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; - case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; - case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; - case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; - case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; - case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; - case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; - case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; - case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; - case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; - case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; - case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; - case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; - case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; - case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; - case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; - case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; - case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; - case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break; - case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break; - case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break; - case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break; - case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break; - case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break; - case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break; - case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break; - case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break; - case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; break; - case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break; - case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break; - case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break; - case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break; - } - // Switch opcode to the one that doesn't get special printing. 
- mcInst.setOpcode(NewOpc); - } - } else if (type == TYPE_AVX512ICC) { - if (immediate >= 8 || ((immediate & 0x3) == 3)) { - unsigned NewOpc; - switch (mcInst.getOpcode()) { - default: llvm_unreachable("unexpected opcode"); - case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break; - case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break; - case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break; - case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break; - case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break; - case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break; - case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break; - case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break; - case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break; - case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break; - case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break; - case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break; - case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break; - case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break; - case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break; - case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break; - case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break; - case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break; - case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break; - case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break; - case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break; - case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break; - case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break; - case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break; - case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break; - case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break; - case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break; - case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break; - case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break; - case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break; - case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break; - case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break; - case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break; - case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break; - case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break; - case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break; - case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break; - case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break; - case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break; - case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break; - case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break; - case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break; - case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break; - case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break; - case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break; - case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break; - case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break; - case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break; - case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break; - case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; 
break; - case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break; - case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break; - case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break; - case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break; - case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break; - case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break; - case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break; - case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break; - case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break; - case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break; - case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break; - case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break; - case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break; - case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break; - case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break; - case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break; - case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break; - case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break; - case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break; - case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break; - case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break; - case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break; - case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break; - case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break; - case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break; - case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break; - case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break; - case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break; - case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break; - case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break; - case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break; - case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break; - case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break; - case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break; - case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break; - case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break; - case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break; - case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break; - case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break; - case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break; - case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break; - case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break; - case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break; - case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break; - case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break; - case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break; - case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break; - case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break; - case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break; - case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break; - case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break; - case X86::VPCMPUWZ256rmik: NewOpc = 
X86::VPCMPUWZ256rmik_alt; break; - case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break; - case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break; - case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break; - case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break; - case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break; - case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break; - case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break; - case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break; - case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break; - case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break; - case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break; - case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break; - case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break; - case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break; - case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break; - case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break; - case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break; - case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break; - } - // Switch opcode to the one that doesn't get special printing. - mcInst.setOpcode(NewOpc); - } } switch (type) { @@ -899,6 +694,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM: case TYPE_YMM: case TYPE_ZMM: + case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: case TYPE_CONTROLREG: @@ -987,6 +783,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, case ENCODING_Rv: translateRegister(mcInst, insn.opcodeRegister); return false; + case ENCODING_CC: + mcInst.addOperand(MCOperand::createImm(insn.immediates[1])); + return false; case ENCODING_FP: translateFPRegister(mcInst, insn.modRM & 7); return false; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 54d550b60652..a241362a271d 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1,9 +1,8 @@ //===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -377,8 +376,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 || nextByte == 0xc6 || nextByte == 0xc7)) { insn->xAcquireRelease = true; - if (nextByte != 0x90) // PAUSE instruction support - break; + break; } if (isREX(insn, nextByte)) { uint8_t nnextByte; @@ -884,7 +882,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) attrMask |= ATTR_EVEXK; if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXL; + attrMask |= ATTR_VEXL; if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) attrMask |= ATTR_EVEXL2; } else if (insn->vectorExtensionType == TYPE_VEX_3B) { @@ -1470,6 +1468,10 @@ static int readModRM(struct InternalInstruction* insn) { if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ + case TYPE_VK_PAIR: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_K0_K1 + (index / 2); \ case TYPE_MM64: \ return prefix##_MM0 + (index & 0x7); \ case TYPE_SEGMENTREG: \ @@ -1847,6 +1849,9 @@ static int readOperands(struct InternalInstruction* insn) { if (readOpcodeRegister(insn, 0)) return -1; break; + case ENCODING_CC: + insn->immediates[1] = insn->opcode & 0xf; + break; case ENCODING_FP: break; case ENCODING_VVVV: diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 3b8a4f732eed..7c0a42c019e3 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -1,9 +1,8 @@ //===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -325,6 +324,12 @@ namespace X86Disassembler { ENTRY(K6) \ ENTRY(K7) +#define REGS_MASK_PAIRS \ + ENTRY(K0_K1) \ + ENTRY(K2_K3) \ + ENTRY(K4_K5) \ + ENTRY(K6_K7) + #define REGS_SEGMENT \ ENTRY(ES) \ ENTRY(CS) \ @@ -394,6 +399,7 @@ namespace X86Disassembler { REGS_YMM \ REGS_ZMM \ REGS_MASKS \ + REGS_MASK_PAIRS \ REGS_SEGMENT \ REGS_DEBUG \ REGS_CONTROL \ diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp deleted file mode 100644 index 3a074818c762..000000000000 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ /dev/null @@ -1,213 +0,0 @@ -//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file includes code for rendering MCInst instances as AT&T-style -// assembly. 
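A standalone sketch of the new ENCODING_CC operand handled in the decoder change above (illustrative only; CondName is not LLVM's table, though the values follow the architectural condition-code encoding): for condition-coded families such as SETcc, the predicate is just the low four bits of the opcode byte, which the decoder now forwards as an explicit immediate operand instead of needing one table entry per condition.

// Standalone sketch: derive the condition-code operand from the opcode.
#include <cstdint>
#include <iostream>

static const char *CondName[16] = {"o",  "no", "b",  "ae", "e",  "ne",
                                   "be", "a",  "s",  "ns", "p",  "np",
                                   "l",  "ge", "le", "g"};

static const char *conditionFromOpcode(uint8_t Opcode) {
  return CondName[Opcode & 0xF]; // what ENCODING_CC stores as immediates[1]
}

int main() {
  // 0F 9x is the SETcc family: 0F 94 /r is sete, 0F 95 /r is setne.
  std::cout << "set" << conditionFromOpcode(0x94) << '\n'; // sete
  std::cout << "set" << conditionFromOpcode(0x95) << '\n'; // setne
}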
-// -//===----------------------------------------------------------------------===// - -#include "X86ATTInstPrinter.h" -#include "MCTargetDesc/X86BaseInfo.h" -#include "X86InstComments.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include <cassert> -#include <cinttypes> -#include <cstdint> - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -// Include the auto-generated portion of the assembly writer. -#define PRINT_ALIAS_INSTR -#include "X86GenAsmWriter.inc" - -void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">"); -} - -void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { - // If verbose assembly is enabled, we can print some informative comments. - if (CommentStream) - HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII); - - printInstFlags(MI, OS); - - // Output CALLpcrel32 as "callq" in 64-bit mode. - // In Intel annotation it's always emitted as "call". - // - // TODO: Probably this hack should be redesigned via InstAlias in - // InstrInfo.td as soon as Requires clause is supported properly - // for InstAlias. - if (MI->getOpcode() == X86::CALLpcrel32 && - (STI.getFeatureBits()[X86::Mode64Bit])) { - OS << "\tcallq\t"; - printPCRelImm(MI, 0, OS); - } - // data16 and data32 both have the same encoding of 0x66. While data32 is - // valid only in 16 bit systems, data16 is valid in the rest. - // There seems to be some lack of support of the Requires clause that causes - // 0x66 to be interpreted as "data16" by the asm printer. - // Thus we add an adjustment here in order to print the "right" instruction. - else if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { - OS << "\tdata32"; - } - // Try to print any aliases first. - else if (!printAliasInstr(MI, OS)) - printInstruction(MI, OS); - - // Next always print the annotation. - printAnnotation(OS, Annot); -} - -void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - printRegName(O, Op.getReg()); - } else if (Op.isImm()) { - // Print immediates as signed values. - int64_t Imm = Op.getImm(); - O << markup("<imm:") << '$' << formatImm(Imm) << markup(">"); - - // TODO: This should be in a helper function in the base class, so it can - // be used by other printers. - - // If there are no instruction-specific comments, add a comment clarifying - // the hex value of the immediate operand when it isn't in the range - // [-256,255]. - if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) { - // Don't print unnecessary hex sign bits. 
- if (Imm == (int16_t)(Imm)) - *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm); - else if (Imm == (int32_t)(Imm)) - *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm); - else - *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm); - } - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << markup("<imm:") << '$'; - Op.getExpr()->print(O, &MAI); - O << markup(">"); - } -} - -void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { - const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); - const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); - - O << markup("<mem:"); - - // If this has a segment register, print it. - printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O); - - if (DispSpec.isImm()) { - int64_t DispVal = DispSpec.getImm(); - if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) - O << formatImm(DispVal); - } else { - assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); - DispSpec.getExpr()->print(O, &MAI); - } - - if (IndexReg.getReg() || BaseReg.getReg()) { - O << '('; - if (BaseReg.getReg()) - printOperand(MI, Op + X86::AddrBaseReg, O); - - if (IndexReg.getReg()) { - O << ','; - printOperand(MI, Op + X86::AddrIndexReg, O); - unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm(); - if (ScaleVal != 1) { - O << ',' << markup("<imm:") << ScaleVal // never printed in hex. - << markup(">"); - } - } - O << ')'; - } - - O << markup(">"); -} - -void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, - raw_ostream &O) { - O << markup("<mem:"); - - // If this has a segment register, print it. - printOptionalSegReg(MI, Op + 1, O); - - O << "("; - printOperand(MI, Op, O); - O << ")"; - - O << markup(">"); -} - -void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, - raw_ostream &O) { - O << markup("<mem:"); - - O << "%es:("; - printOperand(MI, Op, O); - O << ")"; - - O << markup(">"); -} - -void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, - raw_ostream &O) { - const MCOperand &DispSpec = MI->getOperand(Op); - - O << markup("<mem:"); - - // If this has a segment register, print it. - printOptionalSegReg(MI, Op + 1, O); - - if (DispSpec.isImm()) { - O << formatImm(DispSpec.getImm()); - } else { - assert(DispSpec.isExpr() && "non-immediate displacement?"); - DispSpec.getExpr()->print(O, &MAI); - } - - O << markup(">"); -} - -void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, - raw_ostream &O) { - if (MI->getOperand(Op).isExpr()) - return printOperand(MI, Op, O); - - O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) - << markup(">"); -} - -void X86ATTInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - const MCOperand &Op = MI->getOperand(OpNo); - unsigned Reg = Op.getReg(); - // Override the default printing to print st(0) instead st. 
- if (Reg == X86::ST0) - OS << markup("<reg:") << "%st(0)" << markup(">"); - else - printRegName(OS, Reg); -} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp deleted file mode 100644 index 432cd47ae499..000000000000 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.cpp +++ /dev/null @@ -1,142 +0,0 @@ -//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file includes common code for rendering MCInst instances as Intel-style -// and Intel-style assembly. -// -//===----------------------------------------------------------------------===// - -#include "X86InstPrinterCommon.h" -#include "MCTargetDesc/X86BaseInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Casting.h" -#include <cstdint> -#include <cassert> - -using namespace llvm; - -void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm(); - switch (Imm) { - default: llvm_unreachable("Invalid ssecc/avxcc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; - case 8: O << "eq_uq"; break; - case 9: O << "nge"; break; - case 0xa: O << "ngt"; break; - case 0xb: O << "false"; break; - case 0xc: O << "neq_oq"; break; - case 0xd: O << "ge"; break; - case 0xe: O << "gt"; break; - case 0xf: O << "true"; break; - case 0x10: O << "eq_os"; break; - case 0x11: O << "lt_oq"; break; - case 0x12: O << "le_oq"; break; - case 0x13: O << "unord_s"; break; - case 0x14: O << "neq_us"; break; - case 0x15: O << "nlt_uq"; break; - case 0x16: O << "nle_uq"; break; - case 0x17: O << "ord_s"; break; - case 0x18: O << "eq_us"; break; - case 0x19: O << "nge_uq"; break; - case 0x1a: O << "ngt_uq"; break; - case 0x1b: O << "false_os"; break; - case 0x1c: O << "neq_os"; break; - case 0x1d: O << "ge_oq"; break; - case 0x1e: O << "gt_oq"; break; - case 0x1f: O << "true_us"; break; - } -} - -void X86InstPrinterCommon::printXOPCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm(); - switch (Imm) { - default: llvm_unreachable("Invalid xopcc argument!"); - case 0: O << "lt"; break; - case 1: O << "le"; break; - case 2: O << "gt"; break; - case 3: O << "ge"; break; - case 4: O << "eq"; break; - case 5: O << "neq"; break; - case 6: O << "false"; break; - case 7: O << "true"; break; - } -} - -void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0x3; - switch (Imm) { - case 0: O << "{rn-sae}"; break; - case 1: O << "{rd-sae}"; break; - case 2: O << "{ru-sae}"; break; - case 3: O << "{rz-sae}"; break; - } -} - -/// printPCRelImm - This is used to print an immediate value that ends up -/// being encoded as a pc-relative value (e.g. for jumps and calls). In -/// Intel-style these print slightly differently than normal immediates. 
-/// for example, a $ is not emitted. -void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) - O << formatImm(Op.getImm()); - else { - assert(Op.isExpr() && "unknown pcrel immediate operand"); - // If a symbolic branch target was added as a constant expression then print - // that address in hex. - const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { - O << formatHex((uint64_t)Address); - } else { - // Otherwise, just print the expression. - Op.getExpr()->print(O, &MAI); - } - } -} - -void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getReg()) { - printOperand(MI, OpNo, O); - O << ':'; - } -} - -void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - uint64_t TSFlags = Desc.TSFlags; - unsigned Flags = MI->getFlags(); - - if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK)) - O << "\tlock\t"; - - if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK)) - O << "\tnotrack\t"; - - if (Flags & X86::IP_HAS_REPEAT_NE) - O << "\trepne\t"; - else if (Flags & X86::IP_HAS_REPEAT) - O << "\trep\t"; -} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp deleted file mode 100644 index b31f8ab80838..000000000000 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ /dev/null @@ -1,173 +0,0 @@ -//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file includes code for rendering MCInst instances as Intel-style -// assembly. -// -//===----------------------------------------------------------------------===// - -#include "X86IntelInstPrinter.h" -#include "MCTargetDesc/X86BaseInfo.h" -#include "X86InstComments.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include <cassert> -#include <cstdint> - -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#include "X86GenAsmWriter1.inc" - -void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << getRegisterName(RegNo); -} - -void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, - const MCSubtargetInfo &STI) { - printInstFlags(MI, OS); - - // In 16-bit mode, print data16 as data32. - if (MI->getOpcode() == X86::DATA16_PREFIX && - STI.getFeatureBits()[X86::Mode16Bit]) { - OS << "\tdata32"; - } else - printInstruction(MI, OS); - - // Next always print the annotation. - printAnnotation(OS, Annot); - - // If verbose assembly is enabled, we can print some informative comments. 
- if (CommentStream) - EmitAnyX86InstComments(MI, *CommentStream, MII); -} - -void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - printRegName(O, Op.getReg()); - } else if (Op.isImm()) { - O << formatImm((int64_t)Op.getImm()); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << "offset "; - Op.getExpr()->print(O, &MAI); - } -} - -void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, - raw_ostream &O) { - const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); - unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); - const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); - const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); - - // If this has a segment register, print it. - printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O); - - O << '['; - - bool NeedPlus = false; - if (BaseReg.getReg()) { - printOperand(MI, Op+X86::AddrBaseReg, O); - NeedPlus = true; - } - - if (IndexReg.getReg()) { - if (NeedPlus) O << " + "; - if (ScaleVal != 1) - O << ScaleVal << '*'; - printOperand(MI, Op+X86::AddrIndexReg, O); - NeedPlus = true; - } - - if (!DispSpec.isImm()) { - if (NeedPlus) O << " + "; - assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); - DispSpec.getExpr()->print(O, &MAI); - } else { - int64_t DispVal = DispSpec.getImm(); - if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { - if (NeedPlus) { - if (DispVal > 0) - O << " + "; - else { - O << " - "; - DispVal = -DispVal; - } - } - O << formatImm(DispVal); - } - } - - O << ']'; -} - -void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, - raw_ostream &O) { - // If this has a segment register, print it. - printOptionalSegReg(MI, Op + 1, O); - O << '['; - printOperand(MI, Op, O); - O << ']'; -} - -void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, - raw_ostream &O) { - // DI accesses are always ES-based. - O << "es:["; - printOperand(MI, Op, O); - O << ']'; -} - -void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, - raw_ostream &O) { - const MCOperand &DispSpec = MI->getOperand(Op); - - // If this has a segment register, print it. - printOptionalSegReg(MI, Op + 1, O); - - O << '['; - - if (DispSpec.isImm()) { - O << formatImm(DispSpec.getImm()); - } else { - assert(DispSpec.isExpr() && "non-immediate displacement?"); - DispSpec.getExpr()->print(O, &MAI); - } - - O << ']'; -} - -void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, - raw_ostream &O) { - if (MI->getOperand(Op).isExpr()) - return MI->getOperand(Op).getExpr()->print(O, &MAI); - - O << formatImm(MI->getOperand(Op).getImm() & 0xff); -} - -void X86IntelInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - const MCOperand &Op = MI->getOperand(OpNo); - unsigned Reg = Op.getReg(); - // Override the default printing to print st(0) instead st. - if (Reg == X86::ST0) - OS << "st(0)"; - else - printRegName(OS, Reg); -} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp new file mode 100644 index 000000000000..ed2ee55ff2a5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -0,0 +1,487 @@ +//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as AT&T-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#include "X86ATTInstPrinter.h" +#include "X86BaseInfo.h" +#include "X86InstComments.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cinttypes> +#include <cstdint> + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. +#define PRINT_ALIAS_INSTR +#include "X86GenAsmWriter.inc" + +void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">"); +} + +void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII); + + printInstFlags(MI, OS); + + // Output CALLpcrel32 as "callq" in 64-bit mode. + // In Intel annotation it's always emitted as "call". + // + // TODO: Probably this hack should be redesigned via InstAlias in + // InstrInfo.td as soon as Requires clause is supported properly + // for InstAlias. + if (MI->getOpcode() == X86::CALLpcrel32 && + (STI.getFeatureBits()[X86::Mode64Bit])) { + OS << "\tcallq\t"; + printPCRelImm(MI, 0, OS); + } + // data16 and data32 both have the same encoding of 0x66. While data32 is + // valid only in 16 bit systems, data16 is valid in the rest. + // There seems to be some lack of support of the Requires clause that causes + // 0x66 to be interpreted as "data16" by the asm printer. + // Thus we add an adjustment here in order to print the "right" instruction. + else if (MI->getOpcode() == X86::DATA16_PREFIX && + STI.getFeatureBits()[X86::Mode16Bit]) { + OS << "\tdata32"; + } + // Try to print any aliases first. + else if (!printAliasInstr(MI, OS) && + !printVecCompareInstr(MI, OS)) + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); +} + +bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, + raw_ostream &OS) { + if (MI->getNumOperands() == 0 || + !MI->getOperand(MI->getNumOperands() - 1).isImm()) + return false; + + int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + + // Custom print the vector compare instructions to get the immediate + // translated into the mnemonic. 
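A standalone sketch of the printer-side counterpart introduced here (not patch code; the table is trimmed to the eight SSE predicates): when the compare immediate has a named predicate, printVecCompareInstr folds it back into the mnemonic, and immediates without a name keep the plain numeric spelling that the removed *_alt opcodes used to provide.

// Standalone sketch: fold a named compare predicate back into the mnemonic.
#include <cstdint>
#include <iostream>
#include <string>

static std::string compareMnemonic(const std::string &Suffix, uint8_t Imm) {
  static const char *CCNames[8] = {"eq", "lt", "le", "unord",
                                   "neq", "nlt", "nle", "ord"};
  if (Imm < 8)
    return "cmp" + std::string(CCNames[Imm]) + Suffix; // predicate folded in
  return "cmp" + Suffix; // no name for this immediate; "$Imm" printed separately
}

int main() {
  std::cout << compareMnemonic("ps", 2) << '\n';  // cmpleps
  std::cout << compareMnemonic("ps", 11) << '\n'; // cmpps (immediate kept)
}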
+ switch (MI->getOpcode()) { + case X86::CMPPDrmi: case X86::CMPPDrri: + case X86::CMPPSrmi: case X86::CMPPSrri: + case X86::CMPSDrm: case X86::CMPSDrr: + case X86::CMPSDrm_Int: case X86::CMPSDrr_Int: + case X86::CMPSSrm: case X86::CMPSSrr: + case X86::CMPSSrm_Int: case X86::CMPSSrr_Int: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/false, OS); + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, 2, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, 2, OS); + else + printxmmwordmem(MI, 2, OS); + } else + printOperand(MI, 2, OS); + + // Skip operand 1 as its tied to the dest. + + OS << ", "; + printOperand(MI, 0, OS); + return true; + } + break; + + case X86::VCMPPDrmi: case X86::VCMPPDrri: + case X86::VCMPPDYrmi: case X86::VCMPPDYrri: + case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri: + case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri: + case X86::VCMPPDZrmi: case X86::VCMPPDZrri: + case X86::VCMPPSrmi: case X86::VCMPPSrri: + case X86::VCMPPSYrmi: case X86::VCMPPSYrri: + case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri: + case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri: + case X86::VCMPPSZrmi: case X86::VCMPPSZrri: + case X86::VCMPSDrm: case X86::VCMPSDrr: + case X86::VCMPSDZrm: case X86::VCMPSDZrr: + case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int: + case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int: + case X86::VCMPSSrm: case X86::VCMPSSrr: + case X86::VCMPSSZrm: case X86::VCMPSSZrr: + case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int: + case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int: + case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik: + case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik: + case X86::VCMPPDZrmik: case X86::VCMPPDZrrik: + case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: + case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: + case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: + case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk: + case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk: + case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: + case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: + case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: + case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik: + case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik: + case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: + case X86::VCMPPDZrrib: case X86::VCMPPDZrribk: + case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: + case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: + case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + if (Imm >= 0 && Imm <= 31) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/true, OS); + + unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp--, OS); + else + printdwordmem(MI, CurOp--, OS); + + // Print the number of elements broadcasted. + unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 
2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, CurOp--, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, CurOp--, OS); + else if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp--, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp--, OS); + else + printxmmwordmem(MI, CurOp--, OS); + } + } else { + if (Desc.TSFlags & X86II::EVEX_B) + OS << "{sae}, "; + printOperand(MI, CurOp--, OS); + } + + OS << ", "; + printOperand(MI, CurOp--, OS); + OS << ", "; + printOperand(MI, 0, OS); + if (CurOp > 0) { + // Print mask operand. + OS << " {"; + printOperand(MI, CurOp--, OS); + OS << "}"; + } + + return true; + } + break; + + case X86::VPCOMBmi: case X86::VPCOMBri: + case X86::VPCOMDmi: case X86::VPCOMDri: + case X86::VPCOMQmi: case X86::VPCOMQri: + case X86::VPCOMUBmi: case X86::VPCOMUBri: + case X86::VPCOMUDmi: case X86::VPCOMUDri: + case X86::VPCOMUQmi: case X86::VPCOMUQri: + case X86::VPCOMUWmi: case X86::VPCOMUWri: + case X86::VPCOMWmi: case X86::VPCOMWri: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printVPCOMMnemonic(MI, OS); + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) + printxmmwordmem(MI, 2, OS); + else + printOperand(MI, 2, OS); + + OS << ", "; + printOperand(MI, 1, OS); + OS << ", "; + printOperand(MI, 0, OS); + return true; + } + break; + + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri: + case X86::VPCMPBZrmi: case X86::VPCMPBZrri: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri: + case X86::VPCMPDZrmi: case X86::VPCMPDZrri: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri: + case X86::VPCMPQZrmi: case X86::VPCMPQZrri: + case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri: + case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri: + case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri: + case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri: + case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri: + case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri: + case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri: + case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri: + case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri: + case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri: + case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri: + case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri: + case X86::VPCMPWZrmi: case X86::VPCMPWZrri: + case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmik: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmik: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmik: case X86::VPCMPQZrrik: + case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik: + case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik: + case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik: + case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik: + case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik: + case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik: + case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik: + case X86::VPCMPUQZ256rmik: case 
X86::VPCMPUQZ256rrik: + case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik: + case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik: + case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik: + case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik: + case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmik: case X86::VPCMPWZrrik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk: + case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk: + case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk: + case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk: + case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk: + case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk: + if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) { + OS << '\t'; + printVPCMPMnemonic(MI, OS); + + unsigned CurOp = (Desc.TSFlags & X86II::EVEX_K) ? 3 : 2; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit as only D and Q are supported. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp--, OS); + else + printdwordmem(MI, CurOp--, OS); + + // Print the number of elements broadcasted. + unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp--, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp--, OS); + else + printxmmwordmem(MI, CurOp--, OS); + } + } else { + printOperand(MI, CurOp--, OS); + } + + OS << ", "; + printOperand(MI, CurOp--, OS); + OS << ", "; + printOperand(MI, 0, OS); + if (CurOp > 0) { + // Print mask operand. + OS << " {"; + printOperand(MI, CurOp--, OS); + OS << "}"; + } + + return true; + } + break; + } + + return false; +} + +void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + // Print immediates as signed values. + int64_t Imm = Op.getImm(); + O << markup("<imm:") << '$' << formatImm(Imm) << markup(">"); + + // TODO: This should be in a helper function in the base class, so it can + // be used by other printers. + + // If there are no instruction-specific comments, add a comment clarifying + // the hex value of the immediate operand when it isn't in the range + // [-256,255]. + if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) { + // Don't print unnecessary hex sign bits. 
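// (Editorial example, not part of the diff: the annotation only fires for
//  values outside [-256, 255], and the cast chain below picks the narrowest
//  width that round-trips, so Imm == -512 is reported as "imm = 0xFE00" via
//  the int16_t case rather than the fully sign-extended
//  0xFFFFFFFFFFFFFE00.)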
+ if (Imm == (int16_t)(Imm)) + *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm); + else if (Imm == (int32_t)(Imm)) + *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm); + else + *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm); + } + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << markup("<imm:") << '$'; + Op.getExpr()->print(O, &MAI); + O << markup(">"); + } +} + +void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); + + O << markup("<mem:"); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O); + + if (DispSpec.isImm()) { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) + O << formatImm(DispVal); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } + + if (IndexReg.getReg() || BaseReg.getReg()) { + O << '('; + if (BaseReg.getReg()) + printOperand(MI, Op + X86::AddrBaseReg, O); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op + X86::AddrIndexReg, O); + unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm(); + if (ScaleVal != 1) { + O << ',' << markup("<imm:") << ScaleVal // never printed in hex. + << markup(">"); + } + } + O << ')'; + } + + O << markup(">"); +} + +void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<mem:"); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + 1, O); + + O << "("; + printOperand(MI, Op, O); + O << ")"; + + O << markup(">"); +} + +void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<mem:"); + + O << "%es:("; + printOperand(MI, Op, O); + O << ")"; + + O << markup(">"); +} + +void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + + O << markup("<mem:"); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + 1, O); + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << markup(">"); +} + +void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + if (MI->getOperand(Op).isExpr()) + return printOperand(MI, Op, O); + + O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) + << markup(">"); +} + +void X86ATTInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + unsigned Reg = Op.getReg(); + // Override the default printing to print st(0) instead st. 
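// (Editorial note, not part of the diff: the comment above implies that the
//  default register name for X86::ST0 here is the bare "st", so without this
//  override the stack-top operand would print as "%st"; ST(1)-ST(7) still go
//  through printRegName unchanged.)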
+ if (Reg == X86::ST0) + OS << markup("<reg:") << "%st(0)" << markup(">"); + else + printRegName(OS, Reg); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h index 584dc9c286e6..747ddd30a2d9 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h @@ -1,9 +1,8 @@ //=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H #include "X86InstPrinterCommon.h" @@ -22,11 +21,12 @@ class X86ATTInstPrinter final : public X86InstPrinterCommon { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : X86InstPrinterCommon(MAI, MII, MRI) {} + : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {} void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, const MCSubtargetInfo &STI) override; + bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS); // Autogenerated by tblgen, returns true if we successfully printed an // alias. 
@@ -53,43 +53,28 @@ public: printMemReference(MI, OpNo, O); } - void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -136,4 +121,4 @@ private: } // end namespace llvm -#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 64e6fb9f0375..54413fa1a02f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -1,9 +1,8 @@ //===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -13,6 +12,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -26,18 +26,20 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -static unsigned getFixupKindLog2Size(unsigned Kind) { +static unsigned getFixupKindSize(unsigned Kind) { switch (Kind) { default: llvm_unreachable("invalid fixup kind!"); + case FK_NONE: + return 0; case FK_PCRel_1: case FK_SecRel_1: case FK_Data_1: - return 0; + return 1; case FK_PCRel_2: case FK_SecRel_2: case FK_Data_2: - return 1; + return 2; case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: @@ -49,12 +51,12 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { case X86::reloc_branch_4byte_pcrel: case FK_SecRel_4: case FK_Data_4: - return 2; + return 4; case FK_PCRel_8: case FK_SecRel_8: case FK_Data_8: case X86::reloc_global_offset_table8: - return 3; + return 8; } } @@ -77,6 +79,8 @@ public: return X86::NumTargetFixupKinds; } + Optional<MCFixupKind> getFixupKind(StringRef Name) const override; + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, @@ -99,11 +103,14 @@ public: return Infos[Kind - FirstTargetFixupKind]; } + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const override { - unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); + unsigned Size = getFixupKindSize(Fixup.getKind()); assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); @@ -111,7 +118,7 @@ public: // Specifically ignore overflow/underflow as long as the leakage is // limited to the lower bits. This is to remain compatible with // other assemblers. - assert(isIntN(Size * 8 + 1, Value) && + assert((Size == 0 || isIntN(Size * 8 + 1, Value)) && "Value does not fit in the Fixup field"); for (unsigned i = 0; i != Size; ++i) @@ -137,40 +144,10 @@ static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) { switch (Op) { default: return Op; - case X86::JAE_1: - return (is16BitMode) ? X86::JAE_2 : X86::JAE_4; - case X86::JA_1: - return (is16BitMode) ? X86::JA_2 : X86::JA_4; - case X86::JBE_1: - return (is16BitMode) ? X86::JBE_2 : X86::JBE_4; - case X86::JB_1: - return (is16BitMode) ? X86::JB_2 : X86::JB_4; - case X86::JE_1: - return (is16BitMode) ? X86::JE_2 : X86::JE_4; - case X86::JGE_1: - return (is16BitMode) ? X86::JGE_2 : X86::JGE_4; - case X86::JG_1: - return (is16BitMode) ? X86::JG_2 : X86::JG_4; - case X86::JLE_1: - return (is16BitMode) ? X86::JLE_2 : X86::JLE_4; - case X86::JL_1: - return (is16BitMode) ? X86::JL_2 : X86::JL_4; + case X86::JCC_1: + return (is16BitMode) ? X86::JCC_2 : X86::JCC_4; case X86::JMP_1: return (is16BitMode) ? X86::JMP_2 : X86::JMP_4; - case X86::JNE_1: - return (is16BitMode) ? X86::JNE_2 : X86::JNE_4; - case X86::JNO_1: - return (is16BitMode) ? X86::JNO_2 : X86::JNO_4; - case X86::JNP_1: - return (is16BitMode) ? 
X86::JNP_2 : X86::JNP_4; - case X86::JNS_1: - return (is16BitMode) ? X86::JNS_2 : X86::JNS_4; - case X86::JO_1: - return (is16BitMode) ? X86::JO_2 : X86::JO_4; - case X86::JP_1: - return (is16BitMode) ? X86::JP_2 : X86::JP_4; - case X86::JS_1: - return (is16BitMode) ? X86::JS_2 : X86::JS_4; } } @@ -266,6 +243,25 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) { return getRelaxedOpcodeBranch(Inst, is16BitMode); } +Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { + if (STI.getTargetTriple().isOSBinFormatELF()) { + if (STI.getTargetTriple().getArch() == Triple::x86_64) { + if (Name == "R_X86_64_NONE") + return FK_NONE; + } else { + if (Name == "R_386_NONE") + return FK_NONE; + } + } + return MCAsmBackend::getFixupKind(Name); +} + +bool X86AsmBackend::shouldForceRelocation(const MCAssembler &, + const MCFixup &Fixup, + const MCValue &) { + return Fixup.getKind() == FK_NONE; +} + bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const { // Branches can always be relaxed in either mode. diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index c85ce9bbd5a4..6bd6c6cac7df 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -1,9 +1,8 @@ //===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -49,7 +48,8 @@ namespace X86 { TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3, - CUR_DIRECTION = 4 + CUR_DIRECTION = 4, + NO_EXC = 8 }; /// The constants to describe instr prefixes if there are @@ -60,9 +60,46 @@ namespace X86 { IP_HAS_REPEAT_NE = 4, IP_HAS_REPEAT = 8, IP_HAS_LOCK = 16, - NO_SCHED_INFO = 32, // Don't add sched comment to the current instr because - // it was already added - IP_HAS_NOTRACK = 64 + IP_HAS_NOTRACK = 32, + IP_USE_VEX3 = 64, + }; + + enum OperandType : unsigned { + /// AVX512 embedded rounding control. This should only have values 0-3. + OPERAND_ROUNDING_CONTROL = MCOI::OPERAND_FIRST_TARGET, + OPERAND_COND_CODE, + }; + + // X86 specific condition code. These correspond to X86_*_COND in + // X86InstrInfo.td. They must be kept in synch. + enum CondCode { + COND_O = 0, + COND_NO = 1, + COND_B = 2, + COND_AE = 3, + COND_E = 4, + COND_NE = 5, + COND_BE = 6, + COND_A = 7, + COND_S = 8, + COND_NS = 9, + COND_P = 10, + COND_NP = 11, + COND_L = 12, + COND_GE = 13, + COND_LE = 14, + COND_G = 15, + LAST_VALID_COND = COND_G, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches that together + // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs and are inverses of one another. 
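// (Editorial example, not part of the diff: after a ucomiss/ucomisd an
//  FCMP_UNE result is "not equal or unordered", which x86 can only branch on
//  as the two-jump pair "jne; jp" to one target; AnalyzeBranch folds that
//  pair into COND_NE_OR_P, and the FCMP_OEQ (ordered and equal) compound
//  likewise maps to COND_E_AND_NP.)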
+ COND_NE_OR_P, + COND_E_AND_NP, + + COND_INVALID }; } // end namespace X86; @@ -285,6 +322,10 @@ namespace X86II { /// manual, this operand is described as pntr16:32 and pntr16:16 RawFrmImm16 = 8, + /// AddCCFrm - This form is used for Jcc that encode the condition code + /// in the lower 4 bits of the opcode. + AddCCFrm = 9, + /// MRM[0-7][rm] - These forms are used to represent instructions that use /// a Mod/RM byte, and use the middle field to hold extended opcode /// information. In the intel manual these are represented as /0, /1, ... @@ -310,10 +351,21 @@ namespace X86II { /// MRMSrcMemOp4 = 35, + /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM + /// byte to specify the operands and also encodes a condition code. + /// + MRMSrcMemCC = 36, + + /// MRMXm - This form is used for instructions that use the Mod/RM byte + /// to specify a memory source, but doesn't use the middle field. And has + /// a condition code. + /// + MRMXmCC = 38, + /// MRMXm - This form is used for instructions that use the Mod/RM byte /// to specify a memory source, but doesn't use the middle field. /// - MRMXm = 39, // Instruction that uses Mod/RM but not the middle field. + MRMXm = 39, // Next, instructions that operate on a memory r/m operand... MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3 @@ -339,10 +391,21 @@ namespace X86II { /// MRMSrcRegOp4 = 51, + /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM + /// byte to specify the operands and also encodes a condition code + /// + MRMSrcRegCC = 52, + + /// MRMXCCr - This form is used for instructions that use the Mod/RM byte + /// to specify a register source, but doesn't use the middle field. And has + /// a condition code. + /// + MRMXrCC = 54, + /// MRMXr - This form is used for instructions that use the Mod/RM byte /// to specify a register source, but doesn't use the middle field. /// - MRMXr = 55, // Instruction that uses Mod/RM but not the middle field. + MRMXr = 55, // Instructions that operate on a register r/m operand... MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3 @@ -681,8 +744,7 @@ namespace X86II { // has it as the last op. if (NumOps == 9 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && (Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1 || - Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1) && - "Instruction with 2 defs isn't gather?") + Desc.getOperandConstraint(8, MCOI::TIED_TO) == 1)) return 2; return 0; } @@ -711,6 +773,7 @@ namespace X86II { case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: + case X86II::AddCCFrm: return -1; case X86II::MRMDestMem: return 0; @@ -724,16 +787,23 @@ namespace X86II { case X86II::MRMSrcMemOp4: // Skip registers encoded in reg, VEX_VVVV, and I8IMM. return 3; + case X86II::MRMSrcMemCC: + // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a + // mask register. 
+ return 1; case X86II::MRMDestReg: case X86II::MRMSrcReg: case X86II::MRMSrcReg4VOp3: case X86II::MRMSrcRegOp4: + case X86II::MRMSrcRegCC: + case X86II::MRMXrCC: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: return -1; + case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index b724a89f81d2..232a06593238 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -45,7 +44,7 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, (EMachine != ELF::EM_386) && (EMachine != ELF::EM_IAMCU)) {} -enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; +enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; static X86_64RelType getType64(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, @@ -53,6 +52,8 @@ static X86_64RelType getType64(unsigned Kind, switch (Kind) { default: llvm_unreachable("Unimplemented"); + case FK_NONE: + return RT64_NONE; case X86::reloc_global_offset_table8: Modifier = MCSymbolRefExpr::VK_GOT; IsPCRel = true; @@ -103,6 +104,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case MCSymbolRefExpr::VK_None: case MCSymbolRefExpr::VK_X86_ABS8: switch (Type) { + case RT64_NONE: + if (Modifier == MCSymbolRefExpr::VK_None) + return ELF::R_X86_64_NONE; + llvm_unreachable("Unimplemented"); case RT64_64: return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; case RT64_32: @@ -114,6 +119,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_8: return IsPCRel ? 
ELF::R_X86_64_PC8 : ELF::R_X86_64_8; } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOT: switch (Type) { case RT64_64: @@ -123,8 +129,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOTOFF: assert(Type == RT64_64); assert(!IsPCRel); @@ -139,8 +147,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_DTPOFF: assert(!IsPCRel); switch (Type) { @@ -151,8 +161,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_SIZE: assert(!IsPCRel); switch (Type) { @@ -163,8 +175,10 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case RT64_32S: case RT64_16: case RT64_8: + case RT64_NONE: llvm_unreachable("Unimplemented"); } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_TLSCALL: return ELF::R_X86_64_TLSDESC_CALL; case MCSymbolRefExpr::VK_TLSDESC: @@ -197,13 +211,16 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case X86::reloc_riprel_4byte_movq_load: return ELF::R_X86_64_REX_GOTPCRELX; } + llvm_unreachable("unexpected relocation type!"); } } -enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; +enum X86_32RelType { RT32_NONE, RT32_32, RT32_16, RT32_8 }; static X86_32RelType getType32(X86_64RelType T) { switch (T) { + case RT64_NONE: + return RT32_NONE; case RT64_64: llvm_unreachable("Unimplemented"); case RT64_32: @@ -227,6 +244,10 @@ static unsigned getRelocType32(MCContext &Ctx, case MCSymbolRefExpr::VK_None: case MCSymbolRefExpr::VK_X86_ABS8: switch (Type) { + case RT32_NONE: + if (Modifier == MCSymbolRefExpr::VK_None) + return ELF::R_386_NONE; + llvm_unreachable("Unimplemented"); case RT32_32: return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; case RT32_16: @@ -234,6 +255,7 @@ static unsigned getRelocType32(MCContext &Ctx, case RT32_8: return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; } + llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOT: assert(Type == RT32_32); if (IsPCRel) @@ -249,6 +271,10 @@ static unsigned getRelocType32(MCContext &Ctx, assert(Type == RT32_32); assert(!IsPCRel); return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TLSCALL: + return ELF::R_386_TLS_DESC_CALL; + case MCSymbolRefExpr::VK_TLSDESC: + return ELF::R_386_TLS_GOTDESC; case MCSymbolRefExpr::VK_TPOFF: assert(Type == RT32_32); assert(!IsPCRel); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 3c04b13e002e..2d5217115d07 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -1,9 +1,8 @@ //===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 37bed37b0994..73b1969b4e82 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -1,9 +1,8 @@ //===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,8 +13,8 @@ #include "X86InstComments.h" #include "X86ATTInstPrinter.h" -#include "MCTargetDesc/X86BaseInfo.h" -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86BaseInfo.h" +#include "X86MCTargetDesc.h" #include "Utils/X86ShuffleDecode.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -1076,9 +1075,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; + case X86::MOVSDrm_alt: case X86::MOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVSDrm: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1091,8 +1093,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, LLVM_FALLTHROUGH; case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1203,7 +1208,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; CASE_PMOVZX(PMOVZXBW, m) - DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), ShuffleMask); + DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false, + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1211,7 +1217,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; CASE_PMOVZX(PMOVZXBD, m) - DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask); + DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false, + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1219,7 +1226,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; CASE_PMOVZX(PMOVZXBQ, m) - DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask); + DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false, + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1227,7 +1235,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; CASE_PMOVZX(PMOVZXWD, m) 
- DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), ShuffleMask); + DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false, + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1235,7 +1244,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; CASE_PMOVZX(PMOVZXWQ, m) - DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask); + DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false, + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1243,7 +1253,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; CASE_PMOVZX(PMOVZXDQ, m) - DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), ShuffleMask); + DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false, + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; } @@ -1304,6 +1315,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, OS << ']'; --i; // For loop increments element #. } + OS << '\n'; // We successfully added a comment to this instruction. return true; diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.h index 40dffa5fbb8a..96760664012a 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.h @@ -1,9 +1,8 @@ //=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTCOMMENTS_H namespace llvm { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp new file mode 100644 index 000000000000..a21555076976 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -0,0 +1,362 @@ +//===--- X86InstPrinterCommon.cpp - X86 assembly instruction printing -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file includes common code for rendering MCInst instances as Intel-style +// and Intel-style assembly. 
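// (Editorial note, not part of the diff: "common" here means the helpers
//  shared by both the AT&T and Intel printers, i.e. the condition-code,
//  SSE/AVX predicate, VPCMP/VPCOM/CMP mnemonic, rounding-control and
//  instruction-flag printing routines that follow.)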
+// +//===----------------------------------------------------------------------===// + +#include "X86InstPrinterCommon.h" +#include "X86BaseInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" +#include <cstdint> +#include <cassert> + +using namespace llvm; + +void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid condcode argument!"); + case 0: O << "o"; break; + case 1: O << "no"; break; + case 2: O << "b"; break; + case 3: O << "ae"; break; + case 4: O << "e"; break; + case 5: O << "ne"; break; + case 6: O << "be"; break; + case 7: O << "a"; break; + case 8: O << "s"; break; + case 9: O << "ns"; break; + case 0xa: O << "p"; break; + case 0xb: O << "np"; break; + case 0xc: O << "l"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "le"; break; + case 0xf: O << "g"; break; + } +} + +void X86InstPrinterCommon::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid ssecc/avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +void X86InstPrinterCommon::printVPCOMMnemonic(const MCInst *MI, + raw_ostream &OS) { + OS << "vpcom"; + + int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid vpcom argument!"); + case 0: OS << "lt"; break; + case 1: OS << "le"; break; + case 2: OS << "gt"; break; + case 3: OS << "ge"; break; + case 4: OS << "eq"; break; + case 5: OS << "neq"; break; + case 6: OS << "false"; break; + case 7: OS << "true"; break; + } + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::VPCOMBmi: case X86::VPCOMBri: OS << "b\t"; break; + case X86::VPCOMDmi: case X86::VPCOMDri: OS << "d\t"; break; + case X86::VPCOMQmi: case X86::VPCOMQri: OS << "q\t"; break; + case X86::VPCOMUBmi: case X86::VPCOMUBri: OS << "ub\t"; break; + case X86::VPCOMUDmi: case X86::VPCOMUDri: OS << "ud\t"; break; + case X86::VPCOMUQmi: case X86::VPCOMUQri: OS << "uq\t"; break; + case X86::VPCOMUWmi: case X86::VPCOMUWri: OS << "uw\t"; break; + case X86::VPCOMWmi: case X86::VPCOMWri: OS << "w\t"; break; + } +} + +void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI, + raw_ostream &OS) { + OS << "vpcmp"; + + 
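// (Editorial example, not part of the diff: the mnemonic is built from three
//  pieces, "vpcmp" + predicate suffix + element-type suffix, so VPCMPUDZrri
//  with Imm == 1 comes out as "vpcmp" + "lt" + "ud", i.e. "vpcmpltud".)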
printSSEAVXCC(MI, MI->getNumOperands() - 1, OS); + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri: + case X86::VPCMPBZrmi: case X86::VPCMPBZrri: + case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmik: case X86::VPCMPBZrrik: + OS << "b\t"; + break; + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri: + case X86::VPCMPDZrmi: case X86::VPCMPDZrri: + case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmik: case X86::VPCMPDZrrik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + OS << "d\t"; + break; + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri: + case X86::VPCMPQZrmi: case X86::VPCMPQZrri: + case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmik: case X86::VPCMPQZrrik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + OS << "q\t"; + break; + case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri: + case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri: + case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri: + case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik: + case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik: + case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik: + OS << "ub\t"; + break; + case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri: + case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri: + case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri: + case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik: + case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik: + case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik: + case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk: + case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk: + case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk: + OS << "ud\t"; + break; + case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri: + case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri: + case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri: + case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik: + case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik: + case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik: + case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk: + case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk: + case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk: + OS << "uq\t"; + break; + case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri: + case X86::VPCMPUWZ256rri: case X86::VPCMPUWZ256rmi: + case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri: + case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik: + case X86::VPCMPUWZ256rrik: case X86::VPCMPUWZ256rmik: + case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik: + OS << "uw\t"; + break; + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri: + case X86::VPCMPWZrmi: case X86::VPCMPWZrri: + case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmik: case X86::VPCMPWZrrik: + OS << "w\t"; + break; + } +} + +void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool 
IsVCmp, + raw_ostream &OS) { + OS << (IsVCmp ? "vcmp" : "cmp"); + + printSSEAVXCC(MI, MI->getNumOperands() - 1, OS); + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::CMPPDrmi: case X86::CMPPDrri: + case X86::VCMPPDrmi: case X86::VCMPPDrri: + case X86::VCMPPDYrmi: case X86::VCMPPDYrri: + case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri: + case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri: + case X86::VCMPPDZrmi: case X86::VCMPPDZrri: + case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik: + case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik: + case X86::VCMPPDZrmik: case X86::VCMPPDZrrik: + case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: + case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: + case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: + case X86::VCMPPDZrrib: case X86::VCMPPDZrribk: + OS << "pd\t"; + break; + case X86::CMPPSrmi: case X86::CMPPSrri: + case X86::VCMPPSrmi: case X86::VCMPPSrri: + case X86::VCMPPSYrmi: case X86::VCMPPSYrri: + case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri: + case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri: + case X86::VCMPPSZrmi: case X86::VCMPPSZrri: + case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: + case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: + case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: + case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik: + case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik: + case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: + case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: + OS << "ps\t"; + break; + case X86::CMPSDrm: case X86::CMPSDrr: + case X86::CMPSDrm_Int: case X86::CMPSDrr_Int: + case X86::VCMPSDrm: case X86::VCMPSDrr: + case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int: + case X86::VCMPSDZrm: case X86::VCMPSDZrr: + case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int: + case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk: + case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: + OS << "sd\t"; + break; + case X86::CMPSSrm: case X86::CMPSSrr: + case X86::CMPSSrm_Int: case X86::CMPSSrr_Int: + case X86::VCMPSSrm: case X86::VCMPSSrr: + case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int: + case X86::VCMPSSZrm: case X86::VCMPSSZrr: + case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int: + case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk: + case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + OS << "ss\t"; + break; + } +} + +void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: + llvm_unreachable("Invalid rounding control!"); + case X86::TO_NEAREST_INT: + O << "{rn-sae}"; + break; + case X86::TO_NEG_INF: + O << "{rd-sae}"; + break; + case X86::TO_POS_INF: + O << "{ru-sae}"; + break; + case X86::TO_ZERO: + O << "{rz-sae}"; + break; + } +} + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value (e.g. for jumps and calls). In +/// Intel-style these print slightly differently than normal immediates. +/// for example, a $ is not emitted. +void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. 
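// (Editorial example, not part of the diff: a branch target that reached the
//  printer as MCConstantExpr(4195424) is emitted as "0x400460" through
//  formatHex; any other expression falls through to the symbolic print
//  below.)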
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { + O << formatHex((uint64_t)Address); + } else { + // Otherwise, just print the expression. + Op.getExpr()->print(O, &MAI); + } + } +} + +void X86InstPrinterCommon::printOptionalSegReg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getReg()) { + printOperand(MI, OpNo, O); + O << ':'; + } +} + +void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + unsigned Flags = MI->getFlags(); + + if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK)) + O << "\tlock\t"; + + if ((TSFlags & X86II::NOTRACK) || (Flags & X86::IP_HAS_NOTRACK)) + O << "\tnotrack\t"; + + if (Flags & X86::IP_HAS_REPEAT_NE) + O << "\trepne\t"; + else if (Flags & X86::IP_HAS_REPEAT) + O << "\trep\t"; +} + +void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + // In assembly listings, a pair is represented by one of its members, any + // of the two. Here, we pick k0, k2, k4, k6, but we could as well + // print K2_K3 as "k3". It would probably make a lot more sense, if + // the assembly would look something like: + // "vp2intersect %zmm5, %zmm7, {%k2, %k3}" + // but this can work too. + switch (MI->getOperand(OpNo).getReg()) { + case X86::K0_K1: + printRegName(OS, X86::K0); + return; + case X86::K2_K3: + printRegName(OS, X86::K2); + return; + case X86::K4_K5: + printRegName(OS, X86::K4); + return; + case X86::K6_K7: + printRegName(OS, X86::K6); + return; + } + llvm_unreachable("Unknown mask pair register name"); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index f2875e71f22c..8e28f24b619a 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstPrinterCommon.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -1,9 +1,8 @@ //===-- X86InstPrinterCommon.cpp - X86 assembly instruction printing ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTPRINTERCOMMON_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H #include "llvm/MC/MCInstPrinter.h" @@ -24,15 +23,19 @@ public: using MCInstPrinter::MCInstPrinter; virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0; + void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS); void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printVPCOMMnemonic(const MCInst *MI, raw_ostream &OS); + void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS); + void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); protected: void printInstFlags(const MCInst *MI, raw_ostream &O); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm -#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp new file mode 100644 index 000000000000..ea28bef42569 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -0,0 +1,445 @@ +//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as Intel-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#include "X86IntelInstPrinter.h" +#include "X86BaseInfo.h" +#include "X86InstComments.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> + +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. +#define PRINT_ALIAS_INSTR +#include "X86GenAsmWriter1.inc" + +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, + const MCSubtargetInfo &STI) { + printInstFlags(MI, OS); + + // In 16-bit mode, print data16 as data32. 
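// (Editorial note, not part of the diff: the 0x66 prefix toggles the default
//  operand size, so the encoding that tablegen names "data16" must be listed
//  as "data32" when the subtarget is in 16-bit mode; this mirrors the AT&T
//  printer's DATA16_PREFIX handling earlier in this commit.)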
+ if (MI->getOpcode() == X86::DATA16_PREFIX && + STI.getFeatureBits()[X86::Mode16Bit]) { + OS << "\tdata32"; + } else if (!printAliasInstr(MI, OS) && + !printVecCompareInstr(MI, OS)) + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, MII); +} + +bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS) { + if (MI->getNumOperands() == 0 || + !MI->getOperand(MI->getNumOperands() - 1).isImm()) + return false; + + int64_t Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + + // Custom print the vector compare instructions to get the immediate + // translated into the mnemonic. + switch (MI->getOpcode()) { + case X86::CMPPDrmi: case X86::CMPPDrri: + case X86::CMPPSrmi: case X86::CMPPSrri: + case X86::CMPSDrm: case X86::CMPSDrr: + case X86::CMPSDrm_Int: case X86::CMPSDrr_Int: + case X86::CMPSSrm: case X86::CMPSSrr: + case X86::CMPSSrm_Int: case X86::CMPSSrr_Int: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/false, OS); + printOperand(MI, 0, OS); + OS << ", "; + // Skip operand 1 as its tied to the dest. + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, 2, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, 2, OS); + else + printxmmwordmem(MI, 2, OS); + } else + printOperand(MI, 2, OS); + + return true; + } + break; + + case X86::VCMPPDrmi: case X86::VCMPPDrri: + case X86::VCMPPDYrmi: case X86::VCMPPDYrri: + case X86::VCMPPDZ128rmi: case X86::VCMPPDZ128rri: + case X86::VCMPPDZ256rmi: case X86::VCMPPDZ256rri: + case X86::VCMPPDZrmi: case X86::VCMPPDZrri: + case X86::VCMPPSrmi: case X86::VCMPPSrri: + case X86::VCMPPSYrmi: case X86::VCMPPSYrri: + case X86::VCMPPSZ128rmi: case X86::VCMPPSZ128rri: + case X86::VCMPPSZ256rmi: case X86::VCMPPSZ256rri: + case X86::VCMPPSZrmi: case X86::VCMPPSZrri: + case X86::VCMPSDrm: case X86::VCMPSDrr: + case X86::VCMPSDZrm: case X86::VCMPSDZrr: + case X86::VCMPSDrm_Int: case X86::VCMPSDrr_Int: + case X86::VCMPSDZrm_Int: case X86::VCMPSDZrr_Int: + case X86::VCMPSSrm: case X86::VCMPSSrr: + case X86::VCMPSSZrm: case X86::VCMPSSZrr: + case X86::VCMPSSrm_Int: case X86::VCMPSSrr_Int: + case X86::VCMPSSZrm_Int: case X86::VCMPSSZrr_Int: + case X86::VCMPPDZ128rmik: case X86::VCMPPDZ128rrik: + case X86::VCMPPDZ256rmik: case X86::VCMPPDZ256rrik: + case X86::VCMPPDZrmik: case X86::VCMPPDZrrik: + case X86::VCMPPSZ128rmik: case X86::VCMPPSZ128rrik: + case X86::VCMPPSZ256rmik: case X86::VCMPPSZ256rrik: + case X86::VCMPPSZrmik: case X86::VCMPPSZrrik: + case X86::VCMPSDZrm_Intk: case X86::VCMPSDZrr_Intk: + case X86::VCMPSSZrm_Intk: case X86::VCMPSSZrr_Intk: + case X86::VCMPPDZ128rmbi: case X86::VCMPPDZ128rmbik: + case X86::VCMPPDZ256rmbi: case X86::VCMPPDZ256rmbik: + case X86::VCMPPDZrmbi: case X86::VCMPPDZrmbik: + case X86::VCMPPSZ128rmbi: case X86::VCMPPSZ128rmbik: + case X86::VCMPPSZ256rmbi: case X86::VCMPPSZ256rmbik: + case X86::VCMPPSZrmbi: case X86::VCMPPSZrmbik: + case X86::VCMPPDZrrib: case X86::VCMPPDZrribk: + case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: + case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: + case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + if (Imm >= 0 && Imm <= 31) { + OS << '\t'; + printCMPMnemonic(MI, /*IsVCMP*/true, 
OS); + + unsigned CurOp = 0; + printOperand(MI, CurOp++, OS); + + if (Desc.TSFlags & X86II::EVEX_K) { + // Print mask operand. + OS << " {"; + printOperand(MI, CurOp++, OS); + OS << "}"; + } + OS << ", "; + printOperand(MI, CurOp++, OS); + OS << ", "; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp++, OS); + else + printdwordmem(MI, CurOp++, OS); + + // Print the number of elements broadcasted. + unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) + printdwordmem(MI, CurOp++, OS); + else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + printqwordmem(MI, CurOp++, OS); + else if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp++, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp++, OS); + else + printxmmwordmem(MI, CurOp++, OS); + } + } else { + printOperand(MI, CurOp++, OS); + if (Desc.TSFlags & X86II::EVEX_B) + OS << ", {sae}"; + } + + return true; + } + break; + + case X86::VPCOMBmi: case X86::VPCOMBri: + case X86::VPCOMDmi: case X86::VPCOMDri: + case X86::VPCOMQmi: case X86::VPCOMQri: + case X86::VPCOMUBmi: case X86::VPCOMUBri: + case X86::VPCOMUDmi: case X86::VPCOMUDri: + case X86::VPCOMUQmi: case X86::VPCOMUQri: + case X86::VPCOMUWmi: case X86::VPCOMUWri: + case X86::VPCOMWmi: case X86::VPCOMWri: + if (Imm >= 0 && Imm <= 7) { + OS << '\t'; + printVPCOMMnemonic(MI, OS); + printOperand(MI, 0, OS); + OS << ", "; + printOperand(MI, 1, OS); + OS << ", "; + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) + printxmmwordmem(MI, 2, OS); + else + printOperand(MI, 2, OS); + return true; + } + break; + + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rri: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rri: + case X86::VPCMPBZrmi: case X86::VPCMPBZrri: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rri: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rri: + case X86::VPCMPDZrmi: case X86::VPCMPDZrri: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rri: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rri: + case X86::VPCMPQZrmi: case X86::VPCMPQZrri: + case X86::VPCMPUBZ128rmi: case X86::VPCMPUBZ128rri: + case X86::VPCMPUBZ256rmi: case X86::VPCMPUBZ256rri: + case X86::VPCMPUBZrmi: case X86::VPCMPUBZrri: + case X86::VPCMPUDZ128rmi: case X86::VPCMPUDZ128rri: + case X86::VPCMPUDZ256rmi: case X86::VPCMPUDZ256rri: + case X86::VPCMPUDZrmi: case X86::VPCMPUDZrri: + case X86::VPCMPUQZ128rmi: case X86::VPCMPUQZ128rri: + case X86::VPCMPUQZ256rmi: case X86::VPCMPUQZ256rri: + case X86::VPCMPUQZrmi: case X86::VPCMPUQZrri: + case X86::VPCMPUWZ128rmi: case X86::VPCMPUWZ128rri: + case X86::VPCMPUWZ256rmi: case X86::VPCMPUWZ256rri: + case X86::VPCMPUWZrmi: case X86::VPCMPUWZrri: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rri: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rri: + case X86::VPCMPWZrmi: case X86::VPCMPWZrri: + case X86::VPCMPBZ128rmik: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmik: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmik: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik: + case 
X86::VPCMPDZrmik: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmik: case X86::VPCMPQZrrik: + case X86::VPCMPUBZ128rmik: case X86::VPCMPUBZ128rrik: + case X86::VPCMPUBZ256rmik: case X86::VPCMPUBZ256rrik: + case X86::VPCMPUBZrmik: case X86::VPCMPUBZrrik: + case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik: + case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik: + case X86::VPCMPUDZrmik: case X86::VPCMPUDZrrik: + case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik: + case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik: + case X86::VPCMPUQZrmik: case X86::VPCMPUQZrrik: + case X86::VPCMPUWZ128rmik: case X86::VPCMPUWZ128rrik: + case X86::VPCMPUWZ256rmik: case X86::VPCMPUWZ256rrik: + case X86::VPCMPUWZrmik: case X86::VPCMPUWZrrik: + case X86::VPCMPWZ128rmik: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmik: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmik: case X86::VPCMPWZrrik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk: + case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk: + case X86::VPCMPUDZrmib: case X86::VPCMPUDZrmibk: + case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk: + case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk: + case X86::VPCMPUQZrmib: case X86::VPCMPUQZrmibk: + if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) { + OS << '\t'; + printVPCMPMnemonic(MI, OS); + + unsigned CurOp = 0; + printOperand(MI, CurOp++, OS); + + if (Desc.TSFlags & X86II::EVEX_K) { + // Print mask operand. + OS << " {"; + printOperand(MI, CurOp++, OS); + OS << "}"; + } + OS << ", "; + printOperand(MI, CurOp++, OS); + OS << ", "; + + if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { + if (Desc.TSFlags & X86II::EVEX_B) { + // Broadcast form. + // Load size is based on W-bit as only D and Q are supported. + if (Desc.TSFlags & X86II::VEX_W) + printqwordmem(MI, CurOp++, OS); + else + printdwordmem(MI, CurOp++, OS); + + // Print the number of elements broadcasted. + unsigned NumElts; + if (Desc.TSFlags & X86II::EVEX_L2) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 8 : 16; + else if (Desc.TSFlags & X86II::VEX_L) + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; + else + NumElts = (Desc.TSFlags & X86II::VEX_W) ? 
2 : 4; + OS << "{1to" << NumElts << "}"; + } else { + if (Desc.TSFlags & X86II::EVEX_L2) + printzmmwordmem(MI, CurOp++, OS); + else if (Desc.TSFlags & X86II::VEX_L) + printymmwordmem(MI, CurOp++, OS); + else + printxmmwordmem(MI, CurOp++, OS); + } + } else { + printOperand(MI, CurOp++, OS); + } + + return true; + } + break; + } + + return false; +} + +void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + O << formatImm((int64_t)Op.getImm()); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << "offset "; + Op.getExpr()->print(O, &MAI); + } +} + +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + X86::AddrSegmentReg, O); + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op+X86::AddrBaseReg, O); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+X86::AddrIndexReg, O); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << formatImm(DispVal); + } + } + + O << ']'; +} + +void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + 1, O); + O << '['; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + // DI accesses are always ES-based. + O << "es:["; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + + // If this has a segment register, print it. + printOptionalSegReg(MI, Op + 1, O); + + O << '['; + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << ']'; +} + +void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + if (MI->getOperand(Op).isExpr()) + return MI->getOperand(Op).getExpr()->print(O, &MAI); + + O << formatImm(MI->getOperand(Op).getImm() & 0xff); +} + +void X86IntelInstPrinter::printSTiRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + const MCOperand &Op = MI->getOperand(OpNo); + unsigned Reg = Op.getReg(); + // Override the default printing to print st(0) instead st. 
+ if (Reg == X86::ST0) + OS << "st(0)"; + else + printRegName(OS, Reg); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h index fe52bd482a26..f32f49f7c417 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h @@ -1,9 +1,8 @@ //= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H -#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H #include "X86InstPrinterCommon.h" #include "llvm/Support/raw_ostream.h" @@ -28,6 +27,13 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, const MCSubtargetInfo &STI) override; + bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS); + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, raw_ostream &O); @@ -49,58 +55,38 @@ public: printMemReference(MI, OpNo, O); } - void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "byte ptr "; printMemReference(MI, OpNo, O); } - void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "word ptr "; printMemReference(MI, OpNo, O); } - void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printdwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "dword ptr "; printMemReference(MI, OpNo, O); } - void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printqwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "qword ptr "; printMemReference(MI, OpNo, O); } - void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printxmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "xmmword ptr "; printMemReference(MI, OpNo, O); } - void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printymmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "ymmword ptr "; printMemReference(MI, OpNo, O); } - void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printzmmwordmem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "zmmword ptr "; printMemReference(MI, OpNo, O); } - void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "dword ptr "; - printMemReference(MI, OpNo, O); - } - void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "qword ptr "; - printMemReference(MI, OpNo, O); - } - void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + void printtbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "tbyte ptr "; printMemReference(MI, OpNo, O); } - void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "xmmword ptr "; - printMemReference(MI, OpNo, O); - } - void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "ymmword ptr "; - printMemReference(MI, OpNo, O); - } - void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "zmmword ptr "; - printMemReference(MI, OpNo, O); - } void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { @@ -155,4 +141,4 @@ public: } // end namespace llvm -#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H +#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INTELINSTPRINTER_H diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index fa7c352a1b63..e1125c176b25 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -1,9 +1,8 @@ //===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index 30d5c802d1ed..b2369647a40f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -1,9 +1,8 @@ //===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index f5371db9e77a..31d26d08a63f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -525,9 +524,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // indirect register encoding, this handles addresses like [EAX]. The // encoding for [EBP] with no displacement means [disp32] so we handle it // by emitting a displacement of 0 below. - if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) { - EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); - return; + if (BaseRegNo != N86::EBP) { + if (Disp.isImm() && Disp.getImm() == 0) { + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + + // If the displacement is @tlscall, treat it as a zero. + if (Disp.isExpr()) { + auto *Sym = dyn_cast<MCSymbolRefExpr>(Disp.getExpr()); + if (Sym && Sym->getKind() == MCSymbolRefExpr::VK_TLSCALL) { + // This is exclusively used by call *a@tlscall(base). The relocation + // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning. + Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc())); + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + } } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. @@ -880,7 +893,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (HasEVEX_RC) { unsigned RcOperand = NumOps-1; assert(RcOperand >= CurOp); - EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3; + EVEX_rc = MI.getOperand(RcOperand).getImm(); + assert(EVEX_rc <= 3 && "Invalid rounding control!"); } EncodeRC = true; } @@ -979,7 +993,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); // Can we use the 2 byte VEX prefix? 
- if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { + if (!(MI.getFlags() & X86::IP_USE_VEX3) && + Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { EmitByte(0xC5, CurByte, OS); EmitByte(LastByte | (VEX_R << 7), CurByte, OS); return; @@ -1060,16 +1075,17 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; case X86II::MRMSrcReg: + case X86II::MRMSrcRegCC: REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; - case X86II::MRMSrcMem: { + case X86II::MRMSrcMem: + case X86II::MRMSrcMemCC: REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X CurOp += X86::AddrNumOperands; break; - } case X86II::MRMDestReg: REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R @@ -1080,7 +1096,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, CurOp += X86::AddrNumOperands; REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R break; - case X86II::MRMXm: + case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: @@ -1088,7 +1104,7 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X break; - case X86II::MRMXr: + case X86II::MRMXrCC: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: @@ -1272,6 +1288,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow) BaseOpcode = 0x0F; // Weird 3DNow! encoding. + unsigned OpcodeOffset = 0; + uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { default: errs() << "FORM: " << Form << "\n"; @@ -1318,8 +1336,14 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); break; } - case X86II::RawFrm: { - EmitByte(BaseOpcode, CurByte, OS); + case X86II::AddCCFrm: { + // This will be added to the opcode in the fallthrough. + OpcodeOffset = MI.getOperand(NumOps - 1).getImm(); + assert(OpcodeOffset < 16 && "Unexpected opcode offset!"); + --NumOps; // Drop the operand from the end. 
+ LLVM_FALLTHROUGH; + case X86II::RawFrm: + EmitByte(BaseOpcode + OpcodeOffset, CurByte, OS); if (!is64BitMode(STI) || !isPCRel32Branch(MI)) break; @@ -1436,6 +1460,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = SrcRegNum + 1; break; } + case X86II::MRMSrcRegCC: { + unsigned FirstOp = CurOp++; + unsigned SecondOp = CurOp++; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + + EmitRegModRMByte(MI.getOperand(SecondOp), + GetX86RegNum(MI.getOperand(FirstOp)), CurByte, OS); + break; + } case X86II::MRMSrcMem: { unsigned FirstMemOp = CurOp+1; @@ -1481,6 +1516,27 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; break; } + case X86II::MRMSrcMemCC: { + unsigned RegOp = CurOp++; + unsigned FirstMemOp = CurOp; + CurOp = FirstMemOp + X86::AddrNumOperands; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + + emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(RegOp)), + TSFlags, Rex, CurByte, OS, Fixups, STI); + break; + } + + case X86II::MRMXrCC: { + unsigned RegOp = CurOp++; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + EmitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS); + break; + } case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: @@ -1497,6 +1553,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurByte, OS); break; + case X86II::MRMXmCC: { + unsigned FirstMemOp = CurOp; + CurOp = FirstMemOp + X86::AddrNumOperands; + + unsigned CC = MI.getOperand(CurOp++).getImm(); + EmitByte(BaseOpcode + CC, CurByte, OS); + + emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI); + break; + } + case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h index 1070f70468fa..532fecd9951b 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCExpr.h @@ -1,9 +1,8 @@ //=--- X86MCExpr.h - X86 specific MC expression classes ---*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCEXPR_H -#include "InstPrinter/X86ATTInstPrinter.h" +#include "X86ATTInstPrinter.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ea4aaf14223d..ce05ad974507 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,13 +11,15 @@ //===----------------------------------------------------------------------===// #include "X86MCTargetDesc.h" -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86IntelInstPrinter.h" +#include "TargetInfo/X86TargetInfo.h" +#include "X86ATTInstPrinter.h" #include "X86BaseInfo.h" +#include "X86IntelInstPrinter.h" #include "X86MCAsmInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -117,6 +118,15 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::ST6, X86::FP6}, {codeview::RegisterId::ST7, X86::FP7}, + {codeview::RegisterId::MM0, X86::MM0}, + {codeview::RegisterId::MM1, X86::MM1}, + {codeview::RegisterId::MM2, X86::MM2}, + {codeview::RegisterId::MM3, X86::MM3}, + {codeview::RegisterId::MM4, X86::MM4}, + {codeview::RegisterId::MM5, X86::MM5}, + {codeview::RegisterId::MM6, X86::MM6}, + {codeview::RegisterId::MM7, X86::MM7}, + {codeview::RegisterId::XMM0, X86::XMM0}, {codeview::RegisterId::XMM1, X86::XMM1}, {codeview::RegisterId::XMM2, X86::XMM2}, diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 4e9f5ba60d2e..00dd5908cbf5 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -1,9 +1,8 @@ //===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -35,9 +34,6 @@ class StringRef; class raw_ostream; class raw_pwrite_stream; -Target &getTheX86_32Target(); -Target &getTheX86_64Target(); - /// Flavour of dwarf regnumbers /// namespace DWARFFlavour { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 883278b7bc1f..fc7e99f61e5e 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h index 10a282dd2962..3b1e9e7c34fb 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h @@ -1,9 +1,8 @@ //===- X86TargetStreamer.h ------------------------------*- C++ -*---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 2aec695b2dbf..3baab9da1c41 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -1,9 +1,8 @@ //===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 0085787e576a..796a27a17255 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -1,9 +1,8 @@ //===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index bee9b7046338..e9987d1f62bd 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/ShadowCallStack.cpp b/contrib/llvm/lib/Target/X86/ShadowCallStack.cpp deleted file mode 100644 index ab2cebcb58ee..000000000000 --- a/contrib/llvm/lib/Target/X86/ShadowCallStack.cpp +++ /dev/null @@ -1,322 +0,0 @@ -//===------- ShadowCallStack.cpp - Shadow Call Stack pass -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// The ShadowCallStack pass instruments function prologs/epilogs to check that -// the return address has not been corrupted during the execution of the -// function. The return address is stored in a 'shadow call stack' addressed -// using the %gs segment register. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrBuilder.h" -#include "X86InstrInfo.h" -#include "X86Subtarget.h" - -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class ShadowCallStack : public MachineFunctionPass { -public: - static char ID; - - ShadowCallStack() : MachineFunctionPass(ID) { - initializeShadowCallStackPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &Fn) override; - -private: - // Do not instrument leaf functions with this many or fewer instructions. The - // shadow call stack instrumented prolog/epilog are slightly race-y reading - // and checking the saved return address, so it is better to not instrument - // functions that have fewer instructions than the instrumented prolog/epilog - // race. - static const size_t SkipLeafInstructions = 3; -}; - -char ShadowCallStack::ID = 0; -} // end anonymous namespace. - -static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL); -static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL, - MCPhysReg FreeRegister); - -static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB); -static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB, - MCPhysReg FreeRegister); -// Generate a longer epilog that only uses r10 when a tailcall branches to r11. 
-static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB); - -// Helper function to add ModR/M references for [Seg: Reg + Offset] memory -// accesses -static inline const MachineInstrBuilder & -addSegmentedMem(const MachineInstrBuilder &MIB, MCPhysReg Seg, MCPhysReg Reg, - int Offset = 0) { - return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset).addReg(Seg); -} - -static void addProlog(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL) { - const MCPhysReg ReturnReg = X86::R10; - const MCPhysReg OffsetReg = X86::R11; - - auto MBBI = MBB.begin(); - // mov r10, [rsp] - addDirectMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(ReturnReg), - X86::RSP); - // xor r11, r11 - BuildMI(MBB, MBBI, DL, TII->get(X86::XOR64rr)) - .addDef(OffsetReg) - .addReg(OffsetReg, RegState::Undef) - .addReg(OffsetReg, RegState::Undef); - // add QWORD [gs:r11], 8 - addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64mi8)), X86::GS, - OffsetReg) - .addImm(8); - // mov r11, [gs:r11] - addSegmentedMem( - BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64rm)).addDef(OffsetReg), X86::GS, - OffsetReg); - // mov [gs:r11], r10 - addSegmentedMem(BuildMI(MBB, MBBI, DL, TII->get(X86::MOV64mr)), X86::GS, - OffsetReg) - .addReg(ReturnReg); -} - -static void addPrologLeaf(MachineFunction &Fn, const TargetInstrInfo *TII, - MachineBasicBlock &MBB, const DebugLoc &DL, - MCPhysReg FreeRegister) { - // mov REG, [rsp] - addDirectMem(BuildMI(MBB, MBB.begin(), DL, TII->get(X86::MOV64rm)) - .addDef(FreeRegister), - X86::RSP); -} - -static void addEpilog(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB) { - const DebugLoc &DL = MI.getDebugLoc(); - - // xor r11, r11 - BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr)) - .addDef(X86::R11) - .addReg(X86::R11, RegState::Undef) - .addReg(X86::R11, RegState::Undef); - // mov r10, [gs:r11] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R11); - // mov r10, [gs:r10] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R10); - // sub QWORD [gs:r11], 8 - // This instruction should not be moved up to avoid a signal race. 
- addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), - X86::GS, X86::R11) - .addImm(8); - // cmp [rsp], r10 - addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP) - .addReg(X86::R10); - // jne trap - BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB); - MBB.addSuccessor(&TrapBB); -} - -static void addEpilogLeaf(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB, - MCPhysReg FreeRegister) { - const DebugLoc &DL = MI.getDebugLoc(); - - // cmp [rsp], REG - addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP) - .addReg(FreeRegister); - // jne trap - BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB); - MBB.addSuccessor(&TrapBB); -} - -static void addEpilogOnlyR10(const TargetInstrInfo *TII, MachineBasicBlock &MBB, - MachineInstr &MI, MachineBasicBlock &TrapBB) { - const DebugLoc &DL = MI.getDebugLoc(); - - // xor r10, r10 - BuildMI(MBB, MI, DL, TII->get(X86::XOR64rr)) - .addDef(X86::R10) - .addReg(X86::R10, RegState::Undef) - .addReg(X86::R10, RegState::Undef); - // mov r10, [gs:r10] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R10); - // mov r10, [gs:r10] - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm)).addDef(X86::R10), - X86::GS, X86::R10); - // sub QWORD [gs:0], 8 - // This instruction should not be moved up to avoid a signal race. - addSegmentedMem(BuildMI(MBB, MI, DL, TII->get(X86::SUB64mi8)), X86::GS, 0) - .addImm(8); - // cmp [rsp], r10 - addDirectMem(BuildMI(MBB, MI, DL, TII->get(X86::CMP64mr)), X86::RSP) - .addReg(X86::R10); - // jne trap - BuildMI(MBB, MI, DL, TII->get(X86::JNE_1)).addMBB(&TrapBB); - MBB.addSuccessor(&TrapBB); -} - -bool ShadowCallStack::runOnMachineFunction(MachineFunction &Fn) { - if (!Fn.getFunction().hasFnAttribute(Attribute::ShadowCallStack) || - Fn.getFunction().hasFnAttribute(Attribute::Naked)) - return false; - - if (Fn.empty() || !Fn.getRegInfo().tracksLiveness()) - return false; - - // FIXME: Skip functions that have r10 or r11 live on entry (r10 can be live - // on entry for parameters with the nest attribute.) - if (Fn.front().isLiveIn(X86::R10) || Fn.front().isLiveIn(X86::R11)) - return false; - - // FIXME: Skip functions with conditional and r10 tail calls for now. - bool HasReturn = false; - for (auto &MBB : Fn) { - if (MBB.empty()) - continue; - - const MachineInstr &MI = MBB.instr_back(); - if (MI.isReturn()) - HasReturn = true; - - if (MI.isReturn() && MI.isCall()) { - if (MI.findRegisterUseOperand(X86::EFLAGS)) - return false; - // This should only be possible on Windows 64 (see GR64_TC versus - // GR64_TCW64.) - if (MI.findRegisterUseOperand(X86::R10) || - MI.hasRegisterImplicitUseOperand(X86::R10)) - return false; - } - } - - if (!HasReturn) - return false; - - // For leaf functions: - // 1. Do not instrument very short functions where it would not improve that - // function's security. - // 2. Detect if there is an unused caller-saved register we can reserve to - // hold the return address instead of writing/reading it from the shadow - // call stack. 
- MCPhysReg LeafFuncRegister = X86::NoRegister; - if (!Fn.getFrameInfo().adjustsStack()) { - size_t InstructionCount = 0; - std::bitset<X86::NUM_TARGET_REGS> UsedRegs; - for (auto &MBB : Fn) { - for (auto &LiveIn : MBB.liveins()) - UsedRegs.set(LiveIn.PhysReg); - for (auto &MI : MBB) { - if (!MI.isDebugValue() && !MI.isCFIInstruction() && !MI.isLabel()) - InstructionCount++; - for (auto &Op : MI.operands()) - if (Op.isReg() && Op.isDef()) - UsedRegs.set(Op.getReg()); - } - } - - if (InstructionCount <= SkipLeafInstructions) - return false; - - std::bitset<X86::NUM_TARGET_REGS> CalleeSavedRegs; - const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs(); - for (size_t i = 0; CSRegs[i]; i++) - CalleeSavedRegs.set(CSRegs[i]); - - const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); - for (auto &Reg : X86::GR64_NOSPRegClass.getRegisters()) { - // FIXME: Optimization opportunity: spill/restore a callee-saved register - // if a caller-saved register is unavailable. - if (CalleeSavedRegs.test(Reg)) - continue; - - bool Used = false; - for (MCSubRegIterator SR(Reg, TRI, true); SR.isValid(); ++SR) - if ((Used = UsedRegs.test(*SR))) - break; - - if (!Used) { - LeafFuncRegister = Reg; - break; - } - } - } - - const bool LeafFuncOptimization = LeafFuncRegister != X86::NoRegister; - if (LeafFuncOptimization) - // Mark the leaf function register live-in for all MBBs except the entry MBB - for (auto I = ++Fn.begin(), E = Fn.end(); I != E; ++I) - I->addLiveIn(LeafFuncRegister); - - MachineBasicBlock &MBB = Fn.front(); - const MachineBasicBlock *NonEmpty = MBB.empty() ? MBB.getFallThrough() : &MBB; - const DebugLoc &DL = NonEmpty->front().getDebugLoc(); - - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - if (LeafFuncOptimization) - addPrologLeaf(Fn, TII, MBB, DL, LeafFuncRegister); - else - addProlog(Fn, TII, MBB, DL); - - MachineBasicBlock *Trap = nullptr; - for (auto &MBB : Fn) { - if (MBB.empty()) - continue; - - MachineInstr &MI = MBB.instr_back(); - if (MI.isReturn()) { - if (!Trap) { - Trap = Fn.CreateMachineBasicBlock(); - BuildMI(Trap, MI.getDebugLoc(), TII->get(X86::TRAP)); - Fn.push_back(Trap); - } - - if (LeafFuncOptimization) - addEpilogLeaf(TII, MBB, MI, *Trap, LeafFuncRegister); - else if (MI.findRegisterUseOperand(X86::R11)) - addEpilogOnlyR10(TII, MBB, MI, *Trap); - else - addEpilog(TII, MBB, MI, *Trap); - } - } - - return true; -} - -INITIALIZE_PASS(ShadowCallStack, "shadow-call-stack", "Shadow Call Stack", - false, false) - -FunctionPass *llvm::createShadowCallStackPass() { - return new ShadowCallStack(); -} diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index 16c2b56c48b5..47c41626a666 100644 --- a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -1,13 +1,12 @@ //===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "TargetInfo/X86TargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h new file mode 100644 index 000000000000..caf6b8d424fc --- /dev/null +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.h @@ -0,0 +1,21 @@ +//===-- X86TargetInfo.h - X86 Target Implementation -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H +#define LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheX86_32Target(); +Target &getTheX86_64Target(); + +} + +#endif // LLVM_LIB_TARGET_X86_TARGETINFO_X86TARGETINFO_H diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index bed940d0d0e9..48fd3e0b7ab9 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -1,9 +1,8 @@ //===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -300,7 +299,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, unsigned HalfMask = Imm >> (l * 4); unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) - ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i); + ShuffleMask.push_back((HalfMask & 8) ? SM_SentinelZero : (int)i); } } @@ -384,7 +383,8 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm, } void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, - unsigned NumDstElts, SmallVectorImpl<int> &Mask) { + unsigned NumDstElts, bool IsAnyExtend, + SmallVectorImpl<int> &Mask) { unsigned Scale = DstScalarBits / SrcScalarBits; assert(SrcScalarBits < DstScalarBits && "Expected zero extension mask to increase scalar size"); @@ -392,7 +392,7 @@ void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, for (unsigned i = 0; i != NumDstElts; i++) { Mask.push_back(i); for (unsigned j = 1; j != Scale; j++) - Mask.push_back(SM_SentinelZero); + Mask.push_back(IsAnyExtend ? 
SM_SentinelUndef : SM_SentinelZero); } } diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 85cde14a3241..f52785063071 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -1,9 +1,8 @@ //===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -137,7 +136,7 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, /// Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, - unsigned NumDstElts, + unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl<int> &ShuffleMask); /// Decode a move lower and zero upper instruction as a shuffle mask. diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 1c8813815b86..a95f68434d12 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -1,9 +1,8 @@ //===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -50,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass(); /// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// This pass instruments the function prolog to save the return address to a -/// 'shadow call stack' and the function epilog to check that the return address -/// did not change during function execution. -FunctionPass *createShadowCallStackPass(); - /// This pass inserts ENDBR instructions before indirect jump/call /// destinations as part of CET IBT mechanism. 
FunctionPass *createX86IndirectBranchTrackingPass(); @@ -138,11 +132,12 @@ FunctionPass *createX86SpeculativeLoadHardeningPass(); void initializeEvexToVexInstPassPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); -void initializeShadowCallStackPass(PassRegistry &); +void initializeFPSPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); +void initializeX86ExpandPseudoPass(PassRegistry&); void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 6b1749fc7500..3112f00c91f2 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -1,9 +1,8 @@ //===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -40,6 +39,9 @@ def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", "Enable conditional move instructions">; +def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true", + "Support CMPXCHG8B instructions">; + def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; @@ -165,9 +167,16 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", "Enable AVX-512 Vector Neural Network Instructions", [FeatureAVX512]>; +def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true", + "Support bfloat16 floating point", + [FeatureBWI]>; def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true", "Enable AVX-512 Bit Algorithms", [FeatureBWI]>; +def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect", + "HasVP2INTERSECT", "true", + "Enable AVX-512 vp2intersect", + [FeatureAVX512]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -258,6 +267,8 @@ def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true", "Support RDPID instructions">; def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; +def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", + "Has ENQCMD instructions">; // On some processors, instructions that implicitly take two memory operands are // slow. In practice, this means that CALL, PUSH, and POP with memory operands // should be avoided in favor of a MOV + register CALL/PUSH/POP. 
@@ -274,7 +285,7 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", - "Use software floating point features.">; + "Use software floating point features">; def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt", "HasPOPCNTFalseDeps", "true", "POPCNT has a false dependency on dest register">; @@ -342,6 +353,12 @@ def FeatureERMSB "ermsb", "HasERMSB", "true", "REP MOVS/STOS are fast">; +// Bulldozer and newer processors can merge CMP/TEST (but not other +// instructions) with conditional branches. +def FeatureBranchFusion + : SubtargetFeature<"branchfusion", "HasBranchFusion", "true", + "CMP/TEST can be fused with conditional branches">; + // Sandy Bridge and newer processors have many instructions that can be // fused with conditional branches and pass through the CPU as a single // operation. @@ -355,7 +372,7 @@ def FeatureMacroFusion // similar to Skylake Server (AVX-512). def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", - "Indicates if gather is reasonably fast.">; + "Indicates if gather is reasonably fast">; def FeaturePrefer256Bit : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", @@ -366,7 +383,7 @@ def FeaturePrefer256Bit def FeatureRetpolineIndirectCalls : SubtargetFeature< "retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true", - "Remove speculation of indirect calls from the generated code.">; + "Remove speculation of indirect calls from the generated code">; // Lower indirect branches and switches either using conditional branch trees // or using a special construct called a `retpoline` to mitigate potential @@ -374,7 +391,7 @@ def FeatureRetpolineIndirectCalls def FeatureRetpolineIndirectBranches : SubtargetFeature< "retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true", - "Remove speculation of indirect branches from the generated code.">; + "Remove speculation of indirect branches from the generated code">; // Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and // `retpoline-indirect-branches` above. @@ -382,7 +399,7 @@ def FeatureRetpoline : SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true", "Remove speculation of indirect branches from the " "generated code, either by avoiding them entirely or " - "lowering them with a speculation blocking construct.", + "lowering them with a speculation blocking construct", [FeatureRetpolineIndirectCalls, FeatureRetpolineIndirectBranches]>; @@ -395,7 +412,7 @@ def FeatureRetpolineExternalThunk "When lowering an indirect call or branch using a `retpoline`, rely " "on the specified user provided thunk rather than emitting one " "ourselves. Only has effect when combined with some other retpoline " - "feature.", [FeatureRetpolineIndirectCalls]>; + "feature", [FeatureRetpolineIndirectCalls]>; // Direct Move instructions. 
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", @@ -405,7 +422,7 @@ def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true", "Indicates that the BEXTR instruction is implemented as a single uop " - "with good throughput.">; + "with good throughput">; // Combine vector math operations with shuffles into horizontal math // instructions if a CPU implements horizontal operations (introduced with @@ -416,12 +433,33 @@ def FeatureFastHorizontalOps "Prefer horizontal vector math instructions (haddp, phsub, etc.) over " "normal vector instructions with shuffles", [FeatureSSE3]>; +def FeatureFastScalarShiftMasks + : SubtargetFeature< + "fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true", + "Prefer a left/right scalar logical shift pair over a shift+and pair">; + +def FeatureFastVectorShiftMasks + : SubtargetFeature< + "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true", + "Prefer a left/right vector logical shift pair over a shift+and pair">; + // Merge branches using three-way conditional code. def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "ThreewayBranchProfitable", "true", "Merge branches to a three-way " "conditional branch">; +// Bonnell +def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; +// Silvermont +def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">; +// Goldmont +def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">; +// Goldmont Plus +def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">; +// Tremont +def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -440,7 +478,7 @@ include "X86SchedPredicates.td" def X86InstrInfo : InstrInfo; //===----------------------------------------------------------------------===// -// X86 processors supported. 
+// X86 Scheduler Models //===----------------------------------------------------------------------===// include "X86ScheduleAtom.td" @@ -454,37 +492,468 @@ include "X86ScheduleBtVer2.td" include "X86SchedSkylakeClient.td" include "X86SchedSkylakeServer.td" -def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", - "Intel Atom processors">; -def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", - "Intel Silvermont processors">; -def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM", - "Intel Goldmont processors">; -def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP", - "Intel Goldmont Plus processors">; -def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM", - "Intel Tremont processors">; +//===----------------------------------------------------------------------===// +// X86 Processor Feature Lists +//===----------------------------------------------------------------------===// + +def ProcessorFeatures { + // Nehalem + list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureLAHFSAHF, + FeatureMacroFusion]; + list<SubtargetFeature> NHMSpecificFeatures = []; + list<SubtargetFeature> NHMFeatures = + !listconcat(NHMInheritableFeatures, NHMSpecificFeatures); + + // Westmere + list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL]; + list<SubtargetFeature> WSMSpecificFeatures = []; + list<SubtargetFeature> WSMInheritableFeatures = + !listconcat(NHMInheritableFeatures, WSMAdditionalFeatures); + list<SubtargetFeature> WSMFeatures = + !listconcat(WSMInheritableFeatures, WSMSpecificFeatures); + + // Sandybridge + list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX, + FeatureSlowDivide64, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlow3OpsLEA, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureMergeToThreeWayBranch]; + list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> SNBInheritableFeatures = + !listconcat(WSMInheritableFeatures, SNBAdditionalFeatures); + list<SubtargetFeature> SNBFeatures = + !listconcat(SNBInheritableFeatures, SNBSpecificFeatures); + + // Ivybridge + list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase]; + list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> IVBInheritableFeatures = + !listconcat(SNBInheritableFeatures, IVBAdditionalFeatures); + list<SubtargetFeature> IVBFeatures = + !listconcat(IVBInheritableFeatures, IVBSpecificFeatures); + + // Haswell + list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureERMSB, + FeatureFMA, + FeatureINVPCID, + FeatureLZCNT, + FeatureMOVBE, + FeatureFastVariableShuffle]; + list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps]; + list<SubtargetFeature> HSWInheritableFeatures = + !listconcat(IVBInheritableFeatures, HSWAdditionalFeatures); + list<SubtargetFeature> HSWFeatures = + !listconcat(HSWInheritableFeatures, HSWSpecificFeatures); + + // Broadwell + list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX, + FeatureRDSEED, + FeaturePRFCHW]; + list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps]; + list<SubtargetFeature> BDWInheritableFeatures = + 
!listconcat(HSWInheritableFeatures, BDWAdditionalFeatures); + list<SubtargetFeature> BDWFeatures = + !listconcat(BDWInheritableFeatures, BDWSpecificFeatures); + + // Skylake + list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureFastVectorFSQRT]; + list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps, + FeatureSGX]; + list<SubtargetFeature> SKLInheritableFeatures = + !listconcat(BDWInheritableFeatures, SKLAdditionalFeatures); + list<SubtargetFeature> SKLFeatures = + !listconcat(SKLInheritableFeatures, SKLSpecificFeatures); + + // Skylake-AVX512 + list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCLWB]; + list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> SKXInheritableFeatures = + !listconcat(SKLInheritableFeatures, SKXAdditionalFeatures); + list<SubtargetFeature> SKXFeatures = + !listconcat(SKXInheritableFeatures, SKXSpecificFeatures); + + // Cascadelake + list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI]; + list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> CLXInheritableFeatures = + !listconcat(SKXInheritableFeatures, CLXAdditionalFeatures); + list<SubtargetFeature> CLXFeatures = + !listconcat(CLXInheritableFeatures, CLXSpecificFeatures); + + // Cooperlake + list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16]; + list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> CPXInheritableFeatures = + !listconcat(CLXInheritableFeatures, CPXAdditionalFeatures); + list<SubtargetFeature> CPXFeatures = + !listconcat(CPXInheritableFeatures, CPXSpecificFeatures); + + // Cannonlake + list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureVBMI, + FeatureIFMA, + FeatureSHA, + FeatureSGX]; + list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather]; + list<SubtargetFeature> CNLInheritableFeatures = + !listconcat(SKLInheritableFeatures, CNLAdditionalFeatures); + list<SubtargetFeature> CNLFeatures = + !listconcat(CNLInheritableFeatures, CNLSpecificFeatures); + + // Icelake + list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG, + FeatureVAES, + FeatureVBMI2, + FeatureVNNI, + FeatureVPCLMULQDQ, + FeatureVPOPCNTDQ, + FeatureGFNI, + FeatureCLWB, + FeatureRDPID]; + list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather]; + list<SubtargetFeature> ICLInheritableFeatures = + !listconcat(CNLInheritableFeatures, ICLAdditionalFeatures); + list<SubtargetFeature> ICLFeatures = + !listconcat(ICLInheritableFeatures, ICLSpecificFeatures); + + // Icelake Server + list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG, + FeatureWBNOINVD, + FeatureHasFastGather]; + list<SubtargetFeature> ICXFeatures = + !listconcat(ICLInheritableFeatures, ICXSpecificFeatures); + + // Atom + list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowTwoMemOps, + FeatureLAHFSAHF]; + list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom, + FeatureSlowUAMem16, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, 
+ FeatureLEAUsesAG, + FeaturePadShortFunctions]; + list<SubtargetFeature> AtomFeatures = + !listconcat(AtomInheritableFeatures, AtomSpecificFeatures); + + // Silvermont + list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42, + FeaturePOPCNT, + FeaturePCLMUL, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureRDRAND]; + list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM, + FeatureSlowDivide64, + FeatureSlowPMULLD, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> SLMInheritableFeatures = + !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures); + list<SubtargetFeature> SLMFeatures = + !listconcat(SLMInheritableFeatures, SLMSpecificFeatures); + + // Goldmont + list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES, + FeatureMPX, + FeatureSHA, + FeatureRDSEED, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureFSGSBase]; + list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM, + FeaturePOPCNTFalseDeps]; + list<SubtargetFeature> GLMInheritableFeatures = + !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures); + list<SubtargetFeature> GLMFeatures = + !listconcat(GLMInheritableFeatures, GLMSpecificFeatures); + + // Goldmont Plus + list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE, + FeatureRDPID, + FeatureSGX]; + list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP]; + list<SubtargetFeature> GLPInheritableFeatures = + !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures); + list<SubtargetFeature> GLPFeatures = + !listconcat(GLPInheritableFeatures, GLPSpecificFeatures); + + // Tremont + list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE, + FeatureGFNI, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureWAITPKG]; + list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM]; + list<SubtargetFeature> TRMFeatures = + !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures, + TRMSpecificFeatures); + + // Knights Landing + list<SubtargetFeature> KNLFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureSlowDivide64, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF, + FeatureSlow3OpsLEA, + FeatureSlowIncDec, + FeatureAES, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureAVX512, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeaturePREFETCHWT1, + FeatureADX, + FeatureRDSEED, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeaturePRFCHW, + FeatureSlowTwoMemOps, + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather, + FeatureSlowPMADDWD]; + // TODO Add AVX5124FMAPS/AVX5124VNNIW features + list<SubtargetFeature> KNMFeatures = + !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); + + + // Bobcat + list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast15ByteNOP, + FeatureFastScalarShiftMasks, + FeatureFastVectorShiftMasks]; + list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures; + + // Jaguar + list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureXSAVE, + FeatureXSAVEOPT]; + list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT, + 
FeatureFastBEXTR, + FeatureFastPartialYMMorZMMWrite, + FeatureFastHorizontalOps]; + list<SubtargetFeature> BtVer2InheritableFeatures = + !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures); + list<SubtargetFeature> BtVer2Features = + !listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures); + + // Bulldozer + list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureXOP, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureFXSR, + FeatureNOPL, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureLWP, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast11ByteNOP, + FeatureFastScalarShiftMasks, + FeatureBranchFusion]; + list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures; + + // PileDriver + list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureFastBEXTR]; + list<SubtargetFeature> BdVer2InheritableFeatures = + !listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures); + list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures; + + // Steamroller + list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT, + FeatureFSGSBase]; + list<SubtargetFeature> BdVer3InheritableFeatures = + !listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures); + list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures; + + // Excavator + list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2, + FeatureBMI2, + FeatureMWAITX]; + list<SubtargetFeature> BdVer4InheritableFeatures = + !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures); + list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures; + + + // AMD Zen Processors common ISAs + list<SubtargetFeature> ZNFeatures = [FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCLZERO, + FeatureCMOV, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureNOPL, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureFastBEXTR, + FeatureFast15ByteNOP, + FeatureBranchFusion, + FeatureFastScalarShiftMasks, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]; + list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB, + FeatureRDPID, + FeatureWBNOINVD]; + list<SubtargetFeature> ZN2Features = + !listconcat(ZNFeatures, ZN2AdditionalFeatures); +} + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>; +// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled +// if i386/i486 is specifically requested. 
+def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B]>; def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>; def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; - -def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, - FeatureNOPL]>; - -def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureCMOV, FeatureFXSR, FeatureNOPL]>; +def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B]>; +def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B]>; +def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureMMX]>; + +def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureCMOV]>; +def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureCMOV, FeatureNOPL]>; + +def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureCMOV, FeatureFXSR, + FeatureNOPL]>; foreach P = ["pentium3", "pentium3m"] in { - def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, - FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX, + FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>; } // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. @@ -498,13 +967,15 @@ foreach P = ["pentium3", "pentium3m"] in { // changes slightly. def : ProcessorModel<"pentium-m", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; foreach P = ["pentium4", "pentium4m"] in { def : ProcessorModel<P, GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; } // Intel Quark. @@ -512,16 +983,19 @@ def : Proc<"lakemont", []>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; // NetBurst. 
def : ProcessorModel<"prescott", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, + FeatureCMOV]>; def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE3, @@ -535,6 +1009,7 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSSE3, @@ -548,6 +1023,7 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureX87, FeatureSlowUAMem16, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE41, @@ -560,638 +1036,131 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ ]>; // Atom CPUs. -class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ - ProcIntelAtom, - FeatureX87, - FeatureSlowUAMem16, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureLEAForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureSlowTwoMemOps, - FeatureLEAUsesAG, - FeaturePadShortFunctions, - FeatureLAHFSAHF -]>; -def : BonnellProc<"bonnell">; -def : BonnellProc<"atom">; // Pin the generic name to the baseline. - -class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ - ProcIntelSLM, - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSE42, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureSlowDivide64, - FeatureSlowTwoMemOps, - FeaturePRFCHW, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureSlowPMULLD, - FeatureRDRAND, - FeatureLAHFSAHF, - FeaturePOPCNTFalseDeps -]>; -def : SilvermontProc<"silvermont">; -def : SilvermontProc<"slm">; // Legacy alias. 
- -class ProcessorFeatures<list<SubtargetFeature> Inherited, - list<SubtargetFeature> NewFeatures> { - list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures); +foreach P = ["bonnell", "atom"] in { + def : ProcessorModel<P, AtomModel, ProcessorFeatures.AtomFeatures>; } -class ProcModel<string Name, SchedMachineModel Model, - list<SubtargetFeature> ProcFeatures, - list<SubtargetFeature> OtherFeatures> : - ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>; - -def GLMFeatures : ProcessorFeatures<[], [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSE42, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureAES, - FeaturePRFCHW, - FeatureSlowTwoMemOps, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureLAHFSAHF, - FeatureMPX, - FeatureSHA, - FeatureRDRAND, - FeatureRDSEED, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureXSAVEC, - FeatureXSAVES, - FeatureCLFLUSHOPT, - FeatureFSGSBase -]>; +foreach P = ["silvermont", "slm"] in { + def : ProcessorModel<P, SLMModel, ProcessorFeatures.SLMFeatures>; +} -class GoldmontProc<string Name> : ProcModel<Name, SLMModel, - GLMFeatures.Value, [ - ProcIntelGLM, - FeaturePOPCNTFalseDeps -]>; -def : GoldmontProc<"goldmont">; - -def GLPFeatures : ProcessorFeatures<GLMFeatures.Value, [ - FeaturePTWRITE, - FeatureRDPID, - FeatureSGX -]>; - -class GoldmontPlusProc<string Name> : ProcModel<Name, SLMModel, - GLPFeatures.Value, [ - ProcIntelGLP -]>; -def : GoldmontPlusProc<"goldmont-plus">; - -class TremontProc<string Name> : ProcModel<Name, SLMModel, - GLPFeatures.Value, [ - ProcIntelTRM, - FeatureCLDEMOTE, - FeatureGFNI, - FeatureMOVDIRI, - FeatureMOVDIR64B, - FeatureWAITPKG -]>; -def : TremontProc<"tremont">; +def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>; +def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>; +def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>; // "Arrandale" along with corei3 and corei5 -class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSE42, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeatureLAHFSAHF, - FeatureMacroFusion -]>; -def : NehalemProc<"nehalem">; -def : NehalemProc<"corei7">; +foreach P = ["nehalem", "corei7"] in { + def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures>; +} -// Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSE42, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureLAHFSAHF, - FeatureMacroFusion -]>; -def : WestmereProc<"westmere">; - -// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, -// rather than a superset. 
-def SNBFeatures : ProcessorFeatures<[], [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeatureSlowDivide64, - FeaturePCLMUL, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureLAHFSAHF, - FeatureSlow3OpsLEA, - FeatureFastScalarFSQRT, - FeatureFastSHLDRotate, - FeatureSlowIncDec, - FeatureMergeToThreeWayBranch, - FeatureMacroFusion -]>; - -class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, - SNBFeatures.Value, [ - FeatureSlowUAMem32, - FeaturePOPCNTFalseDeps -]>; -def : SandyBridgeProc<"sandybridge">; -def : SandyBridgeProc<"corei7-avx">; // Legacy alias. - -def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [ - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase -]>; - -class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, - IVBFeatures.Value, [ - FeatureSlowUAMem32, - FeaturePOPCNTFalseDeps -]>; -def : IvyBridgeProc<"ivybridge">; -def : IvyBridgeProc<"core-avx-i">; // Legacy alias. - -def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ - FeatureAVX2, - FeatureBMI, - FeatureBMI2, - FeatureERMSB, - FeatureFMA, - FeatureINVPCID, - FeatureLZCNT, - FeatureMOVBE, - FeatureFastVariableShuffle -]>; - -class HaswellProc<string Name> : ProcModel<Name, HaswellModel, - HSWFeatures.Value, [ - FeaturePOPCNTFalseDeps, - FeatureLZCNTFalseDeps -]>; -def : HaswellProc<"haswell">; -def : HaswellProc<"core-avx2">; // Legacy alias. +def : ProcessorModel<"westmere", SandyBridgeModel, + ProcessorFeatures.WSMFeatures>; -def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [ - FeatureADX, - FeatureRDSEED, - FeaturePRFCHW -]>; -class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel, - BDWFeatures.Value, [ - FeaturePOPCNTFalseDeps, - FeatureLZCNTFalseDeps -]>; -def : BroadwellProc<"broadwell">; - -def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ - FeatureAES, - FeatureMPX, - FeatureXSAVEC, - FeatureXSAVES, - FeatureCLFLUSHOPT, - FeatureFastVectorFSQRT -]>; - -class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel, - SKLFeatures.Value, [ - FeatureHasFastGather, - FeaturePOPCNTFalseDeps, - FeatureSGX -]>; -def : SkylakeClientProc<"skylake">; +foreach P = ["sandybridge", "corei7-avx"] in { + def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures>; +} -def KNLFeatures : ProcessorFeatures<[], [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeatureSlowDivide64, - FeaturePCLMUL, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureLAHFSAHF, - FeatureSlow3OpsLEA, - FeatureSlowIncDec, - FeatureAES, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureAVX512, - FeatureERI, - FeatureCDI, - FeaturePFI, - FeaturePREFETCHWT1, - FeatureADX, - FeatureRDSEED, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeaturePRFCHW -]>; +foreach P = ["ivybridge", "core-avx-i"] in { + def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures>; +} -// FIXME: define KNL model -class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, - KNLFeatures.Value, [ - FeatureSlowTwoMemOps, - FeatureFastPartialYMMorZMMWrite, - FeatureHasFastGather, - FeatureSlowPMADDWD -]>; -def : KnightsLandingProc<"knl">; - -class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel, - KNLFeatures.Value, [ - FeatureSlowTwoMemOps, - FeatureFastPartialYMMorZMMWrite, - FeatureHasFastGather, - FeatureSlowPMADDWD, - FeatureVPOPCNTDQ -]>; -def : 
KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features - -def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [ - FeatureAVX512, - FeatureCDI, - FeatureDQI, - FeatureBWI, - FeatureVLX, - FeaturePKU, - FeatureCLWB -]>; +foreach P = ["haswell", "core-avx2"] in { + def : ProcessorModel<P, HaswellModel, ProcessorFeatures.HSWFeatures>; +} -class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, - SKXFeatures.Value, [ - FeatureHasFastGather, - FeaturePOPCNTFalseDeps -]>; -def : SkylakeServerProc<"skylake-avx512">; -def : SkylakeServerProc<"skx">; // Legacy alias. +def : ProcessorModel<"broadwell", BroadwellModel, + ProcessorFeatures.BDWFeatures>; -def CLXFeatures : ProcessorFeatures<SKXFeatures.Value, [ - FeatureVNNI -]>; +def : ProcessorModel<"skylake", SkylakeClientModel, + ProcessorFeatures.SKLFeatures>; -class CascadelakeProc<string Name> : ProcModel<Name, SkylakeServerModel, - CLXFeatures.Value, [ - FeatureHasFastGather, - FeaturePOPCNTFalseDeps -]>; -def : CascadelakeProc<"cascadelake">; - -def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [ - FeatureAVX512, - FeatureCDI, - FeatureDQI, - FeatureBWI, - FeatureVLX, - FeaturePKU, - FeatureVBMI, - FeatureIFMA, - FeatureSHA, - FeatureSGX -]>; +// FIXME: define KNL scheduler model +def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>; +def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>; -class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel, - CNLFeatures.Value, [ - FeatureHasFastGather -]>; -def : CannonlakeProc<"cannonlake">; - -def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [ - FeatureBITALG, - FeatureVAES, - FeatureVBMI2, - FeatureVNNI, - FeatureVPCLMULQDQ, - FeatureVPOPCNTDQ, - FeatureGFNI, - FeatureCLWB, - FeatureRDPID -]>; - -class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel, - ICLFeatures.Value, [ - FeatureHasFastGather -]>; -def : IcelakeClientProc<"icelake-client">; +foreach P = ["skylake-avx512", "skx"] in { + def : ProcessorModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures>; +} -class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, - ICLFeatures.Value, [ - FeaturePCONFIG, - FeatureWBNOINVD, - FeatureHasFastGather -]>; -def : IcelakeServerProc<"icelake-server">; +def : ProcessorModel<"cascadelake", SkylakeServerModel, + ProcessorFeatures.CLXFeatures>; +def : ProcessorModel<"cooperlake", SkylakeServerModel, + ProcessorFeatures.CPXFeatures>; +def : ProcessorModel<"cannonlake", SkylakeServerModel, + ProcessorFeatures.CNLFeatures>; +def : ProcessorModel<"icelake-client", SkylakeServerModel, + ProcessorFeatures.ICLFeatures>; +def : ProcessorModel<"icelake-server", SkylakeServerModel, + ProcessorFeatures.ICXFeatures>; // AMD CPUs. 
-def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX]>; +def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + Feature3DNow]>; +def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + Feature3DNow]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, Feature3DNowA, - FeatureNOPL, FeatureSlowSHLD]>; + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, + Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { - def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, FeatureSSE1, - Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureSlowSHLD]>; + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, + FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL, + FeatureSlowSHLD]>; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, - FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD, - FeatureCMOV]>; + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL, + Feature64Bit, FeatureSlowSHLD, FeatureCMOV, + FeatureFastScalarShiftMasks]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, - FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD, - FeatureCMOV, Feature64Bit]>; + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3, + Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, + FeatureSlowSHLD, FeatureCMOV, Feature64Bit, + FeatureFastScalarShiftMasks]>; } foreach P = ["amdfam10", "barcelona"] in { - def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR, - FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV, Feature64Bit]>; + def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA, + FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV, + Feature64Bit, FeatureFastScalarShiftMasks]>; } // Bobcat -def : Proc<"btver1", [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureSSE4A, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureLZCNT, - FeaturePOPCNT, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast15ByteNOP -]>; - +def : Proc<"btver1", ProcessorFeatures.BtVer1Features>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureAES, - FeaturePCLMUL, - FeatureBMI, - FeatureF16C, - FeatureMOVBE, - FeatureLZCNT, - FeatureFastLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureXSAVEOPT, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast15ByteNOP, - FeatureFastBEXTR, - FeatureFastPartialYMMorZMMWrite, - FeatureFastHorizontalOps -]>; +def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>; // Bulldozer -def : ProcessorModel<"bdver1", BdVer2Model, [ - FeatureX87, - FeatureCMOV, - FeatureXOP, - FeatureFMA4, - 
Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureLWP, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureMacroFusion -]>; +def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>; // Piledriver -def : ProcessorModel<"bdver2", BdVer2Model, [ - FeatureX87, - FeatureCMOV, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - FeatureF16C, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureBMI, - FeatureTBM, - FeatureLWP, - FeatureFMA, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureFastBEXTR, - FeatureMacroFusion -]>; - +def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>; // Steamroller -def : Proc<"bdver3", [ - FeatureX87, - FeatureCMOV, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureAVX, - FeatureFXSR, - FeatureNOPL, - FeatureSSE4A, - FeatureF16C, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureBMI, - FeatureTBM, - FeatureLWP, - FeatureFMA, - FeatureXSAVEOPT, - FeatureSlowSHLD, - FeatureFSGSBase, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureFastBEXTR, - FeatureMacroFusion -]>; - +def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>; // Excavator -def : Proc<"bdver4", [ - FeatureX87, - FeatureCMOV, - FeatureMMX, - FeatureAVX2, - FeatureFXSR, - FeatureNOPL, - FeatureXOP, - FeatureFMA4, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureF16C, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureBMI, - FeatureBMI2, - FeatureTBM, - FeatureLWP, - FeatureFMA, - FeatureXSAVEOPT, - FeatureSlowSHLD, - FeatureFSGSBase, - FeatureLAHFSAHF, - FeatureFastBEXTR, - FeatureFast11ByteNOP, - FeatureMWAITX, - FeatureMacroFusion -]>; +def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>; -// Znver1 -def: ProcessorModel<"znver1", Znver1Model, [ - FeatureADX, - FeatureAES, - FeatureAVX2, - FeatureBMI, - FeatureBMI2, - FeatureCLFLUSHOPT, - FeatureCLZERO, - FeatureCMOV, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureF16C, - FeatureFMA, - FeatureFSGSBase, - FeatureFXSR, - FeatureNOPL, - FeatureFastLZCNT, - FeatureLAHFSAHF, - FeatureLZCNT, - FeatureFastBEXTR, - FeatureFast15ByteNOP, - FeatureMacroFusion, - FeatureMMX, - FeatureMOVBE, - FeatureMWAITX, - FeaturePCLMUL, - FeaturePOPCNT, - FeaturePRFCHW, - FeatureRDRAND, - FeatureRDSEED, - FeatureSHA, - FeatureSSE4A, - FeatureSlowSHLD, - FeatureX87, - FeatureXSAVE, - FeatureXSAVEC, - FeatureXSAVEOPT, - FeatureXSAVES]>; +def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>; +def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>; -def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; +def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE1, FeatureFXSR, FeatureCMOV]>; +def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, + FeatureMMX, 
FeatureSSE1, FeatureFXSR, + FeatureCMOV]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -1205,6 +1174,7 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, [ FeatureX87, + FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2, diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index 36cef98a1ef5..80120722e0e6 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" -#include "InstPrinter/X86ATTInstPrinter.h" +#include "MCTargetDesc/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86TargetStreamer.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/BinaryFormat/COFF.h" @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -104,16 +105,16 @@ void X86AsmPrinter::EmitFunctionBodyEnd() { } } -/// printSymbolOperand - Print a raw symbol reference operand. This handles +/// PrintSymbolOperand - Print a raw symbol reference operand. This handles /// jump tables, constant pools, global address and external symbols, all of /// which print to a label with various suffixes for relocation types etc. -static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, - raw_ostream &O) { +void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { switch (MO.getType()) { default: llvm_unreachable("unknown symbol type!"); case MachineOperand::MO_ConstantPoolIndex: - P.GetCPISymbol(MO.getIndex())->print(O, P.MAI); - P.printOffset(MO.getOffset(), O); + GetCPISymbol(MO.getIndex())->print(O, MAI); + printOffset(MO.getOffset(), O); break; case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); @@ -121,38 +122,37 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, MCSymbol *GVSym; if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) - GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); else - GVSym = P.getSymbol(GV); + GVSym = getSymbol(GV); // Handle dllimport linkage. 
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) - GVSym = - P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + GVSym = OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); else if (MO.getTargetFlags() == X86II::MO_COFFSTUB) GVSym = - P.OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName()); + OutContext.getOrCreateSymbol(Twine(".refptr.") + GVSym->getName()); if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { - MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MCSymbol *Sym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = - P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); if (!StubSym.getPointer()) - StubSym = MachineModuleInfoImpl:: - StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), + !GV->hasInternalLinkage()); } // If the name begins with a dollar-sign, enclose it in parens. We do this // to avoid having it look like an integer immediate to the assembler. if (GVSym->getName()[0] != '$') - GVSym->print(O, P.MAI); + GVSym->print(O, MAI); else { O << '('; - GVSym->print(O, P.MAI); + GVSym->print(O, MAI); O << ')'; } - P.printOffset(MO.getOffset(), O); + printOffset(MO.getOffset(), O); break; } } @@ -169,13 +169,13 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, break; case X86II::MO_GOT_ABSOLUTE_ADDRESS: O << " + [.-"; - P.MF->getPICBaseSymbol()->print(O, P.MAI); + MF->getPICBaseSymbol()->print(O, MAI); O << ']'; break; case X86II::MO_PIC_BASE_OFFSET: case X86II::MO_DARWIN_NONLAZY_PIC_BASE: O << '-'; - P.MF->getPICBaseSymbol()->print(O, P.MAI); + MF->getPICBaseSymbol()->print(O, MAI); break; case X86II::MO_TLSGD: O << "@TLSGD"; break; case X86II::MO_TLSLD: O << "@TLSLD"; break; @@ -193,76 +193,91 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, case X86II::MO_TLVP: O << "@TLVP"; break; case X86II::MO_TLVP_PIC_BASE: O << "@TLVP" << '-'; - P.MF->getPICBaseSymbol()->print(O, P.MAI); + MF->getPICBaseSymbol()->print(O, MAI); break; case X86II::MO_SECREL: O << "@SECREL32"; break; } } -static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, - unsigned OpNo, raw_ostream &O, - const char *Modifier = nullptr, unsigned AsmVariant = 0); - -/// printPCRelImm - This is used to print an immediate value that ends up -/// being encoded as a pc-relative value. These print slightly differently, for -/// example, a $ is not emitted. -static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI, - unsigned OpNo, raw_ostream &O) { +void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); + const bool IsATT = MI->getInlineAsmDialect() == InlineAsm::AD_ATT; switch (MO.getType()) { - default: llvm_unreachable("Unknown pcrel immediate operand"); - case MachineOperand::MO_Register: - // pc-relativeness was handled when computing the value in the reg. 
- printOperand(P, MI, OpNo, O); + default: llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Register: { + if (IsATT) + O << '%'; + O << X86ATTInstPrinter::getRegisterName(MO.getReg()); return; + } + case MachineOperand::MO_Immediate: + if (IsATT) + O << '$'; O << MO.getImm(); return; - case MachineOperand::MO_GlobalAddress: - printSymbolOperand(P, MO, O); - return; + + case MachineOperand::MO_GlobalAddress: { + if (IsATT) + O << '$'; + PrintSymbolOperand(MO, O); + break; + } + case MachineOperand::MO_BlockAddress: { + MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress()); + Sym->print(O, MAI); + break; + } } } -static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, - unsigned OpNo, raw_ostream &O, const char *Modifier, - unsigned AsmVariant) { +/// PrintModifiedOperand - Print subregisters based on supplied modifier, +/// deferring to PrintOperand() if no modifier was supplied or if operand is not +/// a register. +void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { const MachineOperand &MO = MI->getOperand(OpNo); - switch (MO.getType()) { - default: llvm_unreachable("unknown operand type!"); - case MachineOperand::MO_Register: { - // FIXME: Enumerating AsmVariant, so we can remove magic number. - if (AsmVariant == 0) O << '%'; - unsigned Reg = MO.getReg(); - if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : - (strcmp(Modifier+6,"32") == 0) ? 32 : - (strcmp(Modifier+6,"16") == 0) ? 16 : 8; - Reg = getX86SubSuperRegister(Reg, Size); - } - O << X86ATTInstPrinter::getRegisterName(Reg); - return; + if (!Modifier || MO.getType() != MachineOperand::MO_Register) + return PrintOperand(MI, OpNo, O); + if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) + O << '%'; + unsigned Reg = MO.getReg(); + if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); } + O << X86ATTInstPrinter::getRegisterName(Reg); +} +/// PrintPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. These print slightly differently, for +/// example, a $ is not emitted. +void X86AsmPrinter::PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("Unknown pcrel immediate operand"); + case MachineOperand::MO_Register: + // pc-relativeness was handled when computing the value in the reg. 
+ PrintOperand(MI, OpNo, O); + return; case MachineOperand::MO_Immediate: - if (AsmVariant == 0) O << '$'; O << MO.getImm(); return; - - case MachineOperand::MO_GlobalAddress: { - if (AsmVariant == 0) O << '$'; - printSymbolOperand(P, MO, O); - break; - } + case MachineOperand::MO_GlobalAddress: + PrintSymbolOperand(MO, O); + return; } } -static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, - unsigned Op, raw_ostream &O, - const char *Modifier = nullptr) { - const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); - const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); +void X86AsmPrinter::PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg); + const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp); // If we really don't want to print out (rip), don't. bool HasBaseReg = BaseReg.getReg() != 0; @@ -284,7 +299,8 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, } case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ConstantPoolIndex: - printSymbolOperand(P, DispSpec, O); + PrintSymbolOperand(DispSpec, O); + break; } if (Modifier && strcmp(Modifier, "H") == 0) @@ -296,12 +312,12 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, O << '('; if (HasBaseReg) - printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier); + PrintModifiedOperand(MI, OpNo + X86::AddrBaseReg, O, Modifier); if (IndexReg.getReg()) { O << ','; - printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier); - unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + PrintModifiedOperand(MI, OpNo + X86::AddrIndexReg, O, Modifier); + unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm(); if (ScaleVal != 1) O << ',' << ScaleVal; } @@ -309,31 +325,28 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, } } -static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, - unsigned Op, raw_ostream &O, - const char *Modifier = nullptr) { - assert(isMem(*MI, Op) && "Invalid memory reference!"); - const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg); +void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + assert(isMem(*MI, OpNo) && "Invalid memory reference!"); + const MachineOperand &Segment = MI->getOperand(OpNo + X86::AddrSegmentReg); if (Segment.getReg()) { - printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier); + PrintModifiedOperand(MI, OpNo + X86::AddrSegmentReg, O, Modifier); O << ':'; } - printLeaMemReference(P, MI, Op, O, Modifier); + PrintLeaMemReference(MI, OpNo, O, Modifier); } -static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, - unsigned Op, raw_ostream &O, - const char *Modifier = nullptr, - unsigned AsmVariant = 1) { - const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); - unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); - const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); - const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); - const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); +void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, + unsigned OpNo, raw_ostream 
&O) { + const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm(); + const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp); + const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg); // If this has a segment register, print it. if (SegReg.getReg()) { - printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrSegmentReg, O); O << ':'; } @@ -341,7 +354,7 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, bool NeedPlus = false; if (BaseReg.getReg()) { - printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrBaseReg, O); NeedPlus = true; } @@ -349,13 +362,13 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, if (NeedPlus) O << " + "; if (ScaleVal != 1) O << ScaleVal << '*'; - printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrIndexReg, O); NeedPlus = true; } if (!DispSpec.isImm()) { if (NeedPlus) O << " + "; - printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant); + PrintOperand(MI, OpNo + X86::AddrDisp, O); } else { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { @@ -418,7 +431,6 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, /// PrintAsmOperand - Print out an operand for an inline asm expression. /// bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { @@ -429,7 +441,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, switch (ExtraCode[0]) { default: // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O); case 'a': // This is an address. Currently only 'i' and 'r' are expected. switch (MO.getType()) { default: @@ -442,13 +454,13 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case MachineOperand::MO_ExternalSymbol: llvm_unreachable("unexpected operand type!"); case MachineOperand::MO_GlobalAddress: - printSymbolOperand(*this, MO, O); + PrintSymbolOperand(MO, O); if (Subtarget->isPICStyleRIPRel()) O << "(%rip)"; return false; case MachineOperand::MO_Register: O << '('; - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); O << ')'; return false; } @@ -456,7 +468,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'c': // Don't print "$" before a global var name or constant. 
switch (MO.getType()) { default: - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); break; case MachineOperand::MO_Immediate: O << MO.getImm(); @@ -466,7 +478,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case MachineOperand::MO_ExternalSymbol: llvm_unreachable("unexpected operand type!"); case MachineOperand::MO_GlobalAddress: - printSymbolOperand(*this, MO, O); + PrintSymbolOperand(MO, O); break; } return false; @@ -474,7 +486,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'A': // Print '*' before a register (it must be a register) if (MO.isReg()) { O << '*'; - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); return false; } return true; @@ -487,11 +499,11 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'V': // Print native register without '%' if (MO.isReg()) return printAsmMRegister(*this, MO, ExtraCode[0], O); - printOperand(*this, MI, OpNo, O); + PrintOperand(MI, OpNo, O); return false; case 'P': // This is the operand of a call, treat specially. - printPCRelImm(*this, MI, OpNo, O); + PrintPCRelImm(MI, OpNo, O); return false; case 'n': // Negate the immediate or print a '-' before the operand. @@ -505,16 +517,15 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } } - printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant); + PrintOperand(MI, OpNo, O); return false; } -bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, +bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) { - if (AsmVariant) { - printIntelMemReference(*this, MI, OpNo, O); + if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { + PrintIntelMemReference(MI, OpNo, O); return false; } @@ -531,14 +542,14 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, // These only apply to registers, ignore on mem. break; case 'H': - printMemReference(*this, MI, OpNo, O, "H"); + PrintMemReference(MI, OpNo, O, "H"); return false; case 'P': // Don't print @PLT, but do print as memory. - printMemReference(*this, MI, OpNo, O, "no-rip"); + PrintMemReference(MI, OpNo, O, "no-rip"); return false; } } - printMemReference(*this, MI, OpNo, O); + PrintMemReference(MI, OpNo, O, nullptr); return false; } @@ -683,26 +694,31 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // stripping. Since LLVM never generates code that does this, it is always // safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - return; - } - - if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) { - StringRef SymbolName = - (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused"; - MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); - OutStreamer->EmitSymbolAttribute(S, MCSA_Global); - return; - } - - if (TT.isOSBinFormatCOFF()) { + } else if (TT.isOSBinFormatCOFF()) { + if (MMI->usesMSVCFloatingPoint()) { + // In Windows' libcmt.lib, there is a file which is linked in only if the + // symbol _fltused is referenced. Linking this in causes some + // side-effects: + // + // 1. For x86-32, it will set the x87 rounding mode to 53-bit instead of + // 64-bit mantissas at program start. + // + // 2. It links in support routines for floating-point in scanf and printf. + // + // MSVC emits an undefined reference to _fltused when there are any + // floating point operations in the program (including calls). 
A program + // that only has: `scanf("%f", &global_float);` may fail to trigger this, + // but oh well...that's a documented issue. + StringRef SymbolName = + (TT.getArch() == Triple::x86) ? "__fltused" : "_fltused"; + MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + return; + } emitStackMaps(SM); - return; - } - - if (TT.isOSBinFormatELF()) { + } else if (TT.isOSBinFormatELF()) { emitStackMaps(SM); FM.serializeToFaultMapSection(); - return; } } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 55abdf2ba601..a011310970b3 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -1,9 +1,8 @@ //===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -103,6 +102,18 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // Choose between emitting .seh_ directives and .cv_fpo_ directives. void EmitSEHInstruction(const MachineInstr *MI); + void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override; + void PrintOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + void PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier); + void PrintPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + void PrintLeaMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier); + void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier); + void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O); + public: X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); @@ -124,11 +135,9 @@ public: } bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) override; + const char *ExtraCode, raw_ostream &OS) override; bool doInitialization(Module &M) override { SMShadowTracker.reset(0); diff --git a/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 627a6cb14514..3dcc1015dc7c 100644 --- a/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/contrib/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -1,9 +1,8 @@ //===- X86AvoidStoreForwardingBlockis.cpp - Avoid HW Store Forward Block --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -69,9 +68,7 @@ using DisplacementSizeMap = std::map<int64_t, unsigned>; class X86AvoidSFBPass : public MachineFunctionPass { public: static char ID; - X86AvoidSFBPass() : MachineFunctionPass(ID) { - initializeX86AvoidSFBPassPass(*PassRegistry::getPassRegistry()); - } + X86AvoidSFBPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 Avoid Store Forwarding Blocks"; @@ -343,6 +340,8 @@ findPotentialBlockers(MachineInstr *LoadInst) { for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)), E = LoadInst->getParent()->rend(); PBInst != E; ++PBInst) { + if (PBInst->isMetaInstruction()) + continue; BlockCount++; if (BlockCount >= InspectionLimit) break; @@ -366,6 +365,8 @@ findPotentialBlockers(MachineInstr *LoadInst) { for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(), PME = PMBB->rend(); PBInst != PME; ++PBInst) { + if (PBInst->isMetaInstruction()) + continue; PredCount++; if (PredCount >= LimitLeft) break; @@ -407,7 +408,10 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, // If the load and store are consecutive, use the loadInst location to // reduce register pressure. MachineInstr *StInst = StoreInst; - if (StoreInst->getPrevNode() == LoadInst) + auto PrevInstrIt = skipDebugInstructionsBackward( + std::prev(MachineBasicBlock::instr_iterator(StoreInst)), + MBB->instr_begin()); + if (PrevInstrIt.getNodePtr() == LoadInst) StInst = LoadInst; MachineInstr *NewStore = BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode)) @@ -492,19 +496,22 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst, static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) { MachineOperand &LoadBase = getBaseOperand(LoadInst); MachineOperand &StoreBase = getBaseOperand(StoreInst); + auto StorePrevNonDbgInstr = skipDebugInstructionsBackward( + std::prev(MachineBasicBlock::instr_iterator(StoreInst)), + LoadInst->getParent()->instr_begin()).getNodePtr(); if (LoadBase.isReg()) { MachineInstr *LastLoad = LoadInst->getPrevNode(); // If the original load and store to xmm/ymm were consecutive // then the partial copies were also created in // a consecutive order to reduce register pressure, // and the location of the last load is before the last store. 
- if (StoreInst->getPrevNode() == LoadInst) + if (StorePrevNonDbgInstr == LoadInst) LastLoad = LoadInst->getPrevNode()->getPrevNode(); getBaseOperand(LastLoad).setIsKill(LoadBase.isKill()); } if (StoreBase.isReg()) { MachineInstr *StInst = StoreInst; - if (StoreInst->getPrevNode() == LoadInst) + if (StorePrevNonDbgInstr == LoadInst) StInst = LoadInst; getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill()); } @@ -531,7 +538,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { if (!isPotentialBlockedMemCpyLd(MI.getOpcode())) continue; int DefVR = MI.getOperand(0).getReg(); - if (!MRI->hasOneUse(DefVR)) + if (!MRI->hasOneNonDBGUse(DefVR)) continue; for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end(); UI != UE;) { diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 24d7a219e751..4df849a2e14c 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -1,9 +1,8 @@ //===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -60,10 +59,7 @@ namespace { class X86CallFrameOptimization : public MachineFunctionPass { public: - X86CallFrameOptimization() : MachineFunctionPass(ID) { - initializeX86CallFrameOptimizationPass( - *PassRegistry::getPassRegistry()); - } + X86CallFrameOptimization() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp index 1dc83b76595d..b16b3839c85a 100644 --- a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp @@ -1,9 +1,8 @@ //===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -48,8 +47,6 @@ using namespace llvm; -#include "X86GenCallingConv.inc" - X86CallLowering::X86CallLowering(const X86TargetLowering &TLI) : CallLowering(&TLI) {} @@ -64,6 +61,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + assert(OrigArg.Regs.size() == 1 && "Can't handle multple regs yet"); if (OrigArg.Ty->isVoidTy()) return true; @@ -73,12 +71,12 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, if (NumParts == 1) { // replace the original type ( pointer -> GPR ). 
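The X86AvoidSFBPass hunks above swap plain getPrevNode()/hasOneUse() checks for debug-aware equivalents, and skip isMetaInstruction() entries in findPotentialBlockers, so that DBG_VALUE and other meta instructions sitting between a load and its store do not change which copies the pass emits. A sketch of the idiom, reusing the pass's own variable names:

    // Step over debug/meta instructions instead of trusting the literal
    // previous node; otherwise compiling with -g would perturb the optimization.
    auto PrevInstrIt = skipDebugInstructionsBackward(
        std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
        MBB->instr_begin());
    bool Consecutive = PrevInstrIt.getNodePtr() == LoadInst;

    // Likewise, a value still counts as single-use when its only extra uses
    // are debug uses.
    bool SingleUse = MRI->hasOneNonDBGUse(DefVR);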
- SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context), + SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Context), OrigArg.Flags, OrigArg.IsFixed); return true; } - SmallVector<unsigned, 8> SplitRegs; + SmallVector<Register, 8> SplitRegs; EVT PartVT = TLI.getRegisterType(Context, VT); Type *PartTy = PartVT.getTypeForEVT(Context); @@ -88,7 +86,7 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)), PartTy, OrigArg.Flags}; SplitArgs.push_back(Info); - SplitRegs.push_back(Info.Reg); + SplitRegs.push_back(Info.Regs[0]); } PerformArgSplit(SplitRegs); @@ -104,28 +102,28 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { DL(MIRBuilder.getMF().getDataLayout()), STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {} - unsigned getStackAddress(uint64_t Size, int64_t Offset, + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); LLT SType = LLT::scalar(DL.getPointerSizeInBits(0)); - unsigned SPReg = MRI.createGenericVirtualRegister(p0); + Register SPReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister()); - unsigned OffsetReg = MRI.createGenericVirtualRegister(SType); + Register OffsetReg = MRI.createGenericVirtualRegister(SType); MIRBuilder.buildConstant(OffsetReg, Offset); - unsigned AddrReg = MRI.createGenericVirtualRegister(p0); + Register AddrReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); return AddrReg; } - void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { MIB.addUse(PhysReg, RegState::Implicit); - unsigned ExtReg; + Register ExtReg; // If we are copying the value to a physical register with the // size larger than the size of the value itself - build AnyExt // to the size of the register first and only then do the copy. 
@@ -146,12 +144,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MIRBuilder.buildCopy(PhysReg, ExtReg); } - void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - unsigned ExtReg = extendRegister(ValVReg, VA); + Register ExtReg = extendRegister(ValVReg, VA); auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(), - /* Alignment */ 0); + /* Alignment */ 1); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } @@ -185,7 +183,7 @@ protected: bool X86CallLowering::lowerReturn( MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<unsigned> VRegs) const { + ArrayRef<Register> VRegs) const { assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && "Return value without a vreg"); auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0); @@ -208,7 +206,7 @@ bool X86CallLowering::lowerReturn( ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)}; setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); if (!splitToValueTypes(CurArgInfo, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs) { + [&](ArrayRef<Register> Regs) { MIRBuilder.buildUnmerge(Regs, VRegs[i]); })) return false; @@ -231,7 +229,9 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { : ValueHandler(MIRBuilder, MRI, AssignFn), DL(MIRBuilder.getMF().getDataLayout()) {} - unsigned getStackAddress(uint64_t Size, int64_t Offset, + bool isArgumentHandler() const override { return true; } + + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { auto &MFI = MIRBuilder.getMF().getFrameInfo(); int FI = MFI.CreateFixedObject(Size, Offset, true); @@ -243,15 +243,15 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { return AddrReg; } - void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, - 0); + 1); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } - void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { markPhysRegUsed(PhysReg); @@ -320,9 +320,9 @@ protected: } // end anonymous namespace -bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef<unsigned> VRegs) const { +bool X86CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const { if (F.arg_empty()) return true; @@ -344,14 +344,14 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, Arg.hasAttribute(Attribute::StructRet) || Arg.hasAttribute(Attribute::SwiftSelf) || Arg.hasAttribute(Attribute::SwiftError) || - Arg.hasAttribute(Attribute::Nest)) + Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1) return false; ArgInfo OrigArg(VRegs[Idx], Arg.getType()); setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs) { - MIRBuilder.buildMerge(VRegs[Idx], Regs); + [&](ArrayRef<Register> Regs) { + MIRBuilder.buildMerge(VRegs[Idx][0], Regs); })) return false; Idx++; @@ -409,9 +409,12 
@@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (OrigArg.Flags.isByVal()) return false; + if (OrigArg.Regs.size() > 1) + return false; + if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs) { - MIRBuilder.buildUnmerge(Regs, OrigArg.Reg); + [&](ArrayRef<Register> Regs) { + MIRBuilder.buildUnmerge(Regs, OrigArg.Regs[0]); })) return false; } @@ -451,12 +454,15 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. - if (OrigRet.Reg) { + if (!OrigRet.Ty->isVoidTy()) { + if (OrigRet.Regs.size() > 1) + return false; + SplitArgs.clear(); - SmallVector<unsigned, 8> NewRegs; + SmallVector<Register, 8> NewRegs; if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs) { + [&](ArrayRef<Register> Regs) { NewRegs.assign(Regs.begin(), Regs.end()); })) return false; @@ -466,7 +472,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; if (!NewRegs.empty()) - MIRBuilder.buildMerge(OrigRet.Reg, NewRegs); + MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs); } CallSeqStart.addImm(Handler.getStackSize()) diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm/lib/Target/X86/X86CallLowering.h index f5f8f9a3ef6d..0445331bc3ff 100644 --- a/contrib/llvm/lib/Target/X86/X86CallLowering.h +++ b/contrib/llvm/lib/Target/X86/X86CallLowering.h @@ -1,9 +1,8 @@ //===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,10 +29,10 @@ public: X86CallLowering(const X86TargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<unsigned> VRegs) const override; + ArrayRef<Register> VRegs) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef<unsigned> VRegs) const override; + ArrayRef<ArrayRef<Register>> VRegs) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, const MachineOperand &Callee, const ArgInfo &OrigRet, @@ -41,7 +40,7 @@ public: private: /// A function of this type is used to perform value split action. - using SplitArgTy = std::function<void(ArrayRef<unsigned>)>; + using SplitArgTy = std::function<void(ArrayRef<Register>)>; bool splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp index 59dde982f512..aee344a26764 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp @@ -1,9 +1,8 @@ //=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
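The GlobalISel hunks above track two interface changes: plain unsigned virtual-register numbers become the Register type, and lowerFormalArguments now receives one register list per IR argument rather than a flat list, so an argument split across several virtual registers can be expressed. A sketch of the new shape, using the same names as the header above; the X86 port keeps its old single-register behaviour and simply bails out on anything wider.

    // One Register list per IR argument (from X86CallLowering.h above).
    bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                              ArrayRef<ArrayRef<Register>> VRegs) const override;

    // In the .cpp above, arguments that would need more than one virtual
    // register are rejected rather than handled:
    //   if (... || VRegs[Idx].size() > 1)
    //     return false;
    //   MIRBuilder.buildMerge(VRegs[Idx][0], Regs);   // single-vreg case only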
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,16 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86CallingConv.h" #include "X86Subtarget.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" -namespace llvm { - -bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { +using namespace llvm; + +/// When regcall calling convention compiled to 32 bit arch, special treatment +/// is required for 64 bit masks. +/// The value should be assigned to two GPRs. +/// \return true if registers were allocated and false otherwise. +static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { // List of GPR registers that are available to store values in regcall // calling convention. static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI, @@ -113,9 +119,15 @@ static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT, return false; } -bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 64 bit arch. +/// For HVAs shadow registers might be allocated on the first pass +/// and actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. +static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { // On the second pass, go through the HVAs only. if (ArgFlags.isSecArgPass()) { if (ArgFlags.isHva()) @@ -150,7 +162,10 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, // created on top of the basic 32 bytes of win64. // It can happen if the fifth or sixth argument is vector type or HVA. // At that case for each argument a shadow stack of 8 bytes is allocated. - if (Reg == X86::XMM4 || Reg == X86::XMM5) + const TargetRegisterInfo *TRI = + State.getMachineFunction().getSubtarget().getRegisterInfo(); + if (TRI->regsOverlap(Reg, X86::XMM4) || + TRI->regsOverlap(Reg, X86::XMM5)) State.AllocateStack(8, 8); if (!ArgFlags.isHva()) { @@ -165,9 +180,14 @@ bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return ArgFlags.isHva(); } -bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 32 bit arch. +/// For HVAs actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. +static bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { // On the second pass, go through the HVAs only. 
if (ArgFlags.isSecArgPass()) { if (ArgFlags.isHva()) @@ -205,4 +225,110 @@ bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return false; // No register was assigned - Continue the search. } -} // End llvm namespace +static bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " + "stackmap and patchpoint intrinsics."); + // gracefully fallback to X86 C calling convention on Release builds. + return false; +} + +static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList) / sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of an double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. + if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + +/// X86 interrupt handlers can only take one or two stack arguments, but if +/// there are two arguments, they are in the opposite order from the standard +/// convention. Therefore, we have to look at the argument count up front before +/// allocating stack for each argument. +static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + const MachineFunction &MF = State.getMachineFunction(); + size_t ArgCount = State.getMachineFunction().getFunction().arg_size(); + bool Is64Bit = static_cast<const X86Subtarget &>(MF.getSubtarget()).is64Bit(); + unsigned SlotSize = Is64Bit ? 8 : 4; + unsigned Offset; + if (ArgCount == 1 && ValNo == 0) { + // If we have one argument, the argument is five stack slots big, at fixed + // offset zero. + Offset = State.AllocateStack(5 * SlotSize, 4); + } else if (ArgCount == 2 && ValNo == 0) { + // If we have two arguments, the stack slot is *after* the error code + // argument. 
Pretend it doesn't consume stack space, and account for it when + // we assign the second argument. + Offset = SlotSize; + } else if (ArgCount == 2 && ValNo == 1) { + // If this is the second of two arguments, it must be the error code. It + // appears first on the stack, and is then followed by the five slot + // interrupt struct. + Offset = 0; + (void)State.AllocateStack(6 * SlotSize, 4); + } else { + report_fatal_error("unsupported x86 interrupt prototype"); + } + + // FIXME: This should be accounted for in + // X86FrameLowering::getFrameIndexReference, not here. + if (Is64Bit && ArgCount == 2) + Offset += SlotSize; + + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return true; +} + +// Provides entry points of CC_X86 and RetCC_X86. +#include "X86GenCallingConv.inc" diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index d0fcbd313312..191e0fa619b2 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -1,9 +1,8 @@ //=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,99 +20,12 @@ namespace llvm { -/// When regcall calling convention compiled to 32 bit arch, special treatment -/// is required for 64 bit masks. -/// The value should be assigned to two GPRs. -/// \return true if registers were allocated and false otherwise. -bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State); - -/// Vectorcall calling convention has special handling for vector types or -/// HVA for 64 bit arch. -/// For HVAs shadow registers might be allocated on the first pass -/// and actual XMM registers are allocated on the second pass. -/// For vector types, actual XMM registers are allocated on the first pass. -/// \return true if registers were allocated and false otherwise. -bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State); - -/// Vectorcall calling convention has special handling for vector types or -/// HVA for 32 bit arch. -/// For HVAs actual XMM registers are allocated on the second pass. -/// For vector types, actual XMM registers are allocated on the first pass. -/// \return true if registers were allocated and false otherwise. -bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State); - -inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, - CCValAssign::LocInfo &, ISD::ArgFlagsTy &, - CCState &) { - llvm_unreachable("The AnyReg calling convention is only supported by the " \ - "stackmap and patchpoint intrinsics."); - // gracefully fallback to X86 C calling convention on Release builds. 
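CC_X86_Intr above encodes the source-level contract of the x86 interrupt calling convention (the convention compilers expose through the x86 interrupt attribute). A sketch of the handler shapes it corresponds to; the type and parameter names are illustrative, and the frame contents follow the "five stack slots" wording of the comments above.

    struct InterruptFrame;   // the five-slot block mentioned above (return ip,
                             // cs, flags, stack pointer, stack segment)

    // One argument: only the interrupt frame, assigned a fixed five-slot
    // stack area at offset zero.
    void isr_no_code(InterruptFrame *frame);

    // Two arguments: the CPU pushes an error code before the frame, so the
    // error code is the value at offset 0 and the frame follows it -- the
    // reverse of the usual left-to-right order, which is why the routine
    // above special-cases ValNo 0 and 1 instead of allocating slots in
    // argument order.
    void isr_with_code(InterruptFrame *frame, unsigned long error_code);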
- return false; -} - -inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure - // not to split i64 and double between a register and stack - static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; - static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); - - SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); - - // If this is the first part of an double/i64/i128, or if we're already - // in the middle of a split, add to the pending list. If this is not - // the end of the split, return, otherwise go on to process the pending - // list - if (ArgFlags.isSplit() || !PendingMembers.empty()) { - PendingMembers.push_back( - CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); - if (!ArgFlags.isSplitEnd()) - return true; - } - - // If there are no pending members, we are not in the middle of a split, - // so do the usual inreg stuff. - if (PendingMembers.empty()) { - if (unsigned Reg = State.AllocateReg(RegList)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return true; - } - return false; - } - - assert(ArgFlags.isSplitEnd()); - - // We now have the entire original argument in PendingMembers, so decide - // whether to use registers or the stack. - // Per the MCU ABI: - // a) To use registers, we need to have enough of them free to contain - // the entire argument. - // b) We never want to use more than 2 registers for a single argument. - - unsigned FirstFree = State.getFirstUnallocated(RegList); - bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); - - for (auto &It : PendingMembers) { - if (UseRegs) - It.convertToReg(State.AllocateReg(RegList[FirstFree++])); - else - It.convertToMem(State.AllocateStack(4, 4)); - State.addLoc(It); - } - - PendingMembers.clear(); +bool RetCC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State); - return true; -} +bool CC_X86(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index fe49c9ffbd95..1c3034a5116a 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -1,9 +1,8 @@ //===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -148,7 +147,8 @@ def CC_#NAME : CallingConv<[ CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> ]>; def RetCC_#NAME : CallingConv<[ @@ -477,6 +477,7 @@ def RetCC_X86_64 : CallingConv<[ ]>; // This is the return-value convention used for the entire X86 backend. 
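CC_X86_32_MCUInReg, now a static helper in X86CallingConv.cpp per the hunks above, gathers the pieces of a split argument in the pending list and then commits them all to registers or all to the stack. A worked example of that decision under the three-register list used above (EAX, EDX, ECX); the signatures are only illustrative.

    // f(long long a, int b):
    //   'a' arrives as two pending i32 pieces; 2 <= min(2, 3 free), so it is
    //   assigned the EAX:EDX pair, and 'b' then takes ECX.
    // f(int a, long long b):
    //   'a' takes EAX; 'b' still fits because 2 <= min(2, 2 free), so it gets
    //   EDX:ECX.
    // If fewer registers remained than the whole argument needs, every pending
    // piece is converted to a 4-byte stack slot instead -- an argument is
    // never split between registers and memory.
    bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);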
+let Entry = 1 in def RetCC_X86 : CallingConv<[ // Check if this is the Intel OpenCL built-ins calling convention @@ -567,7 +568,7 @@ def CC_X86_64_C : CallingConv<[ CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -612,7 +613,7 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, // 512 bit vectors are passed by pointer - CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect<i64>>, @@ -985,14 +986,6 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo<CC_X86_32_C> ]>; -def CC_X86_32_Intr : CallingConv<[ - CCAssignToStack<4, 4> -]>; - -def CC_X86_64_Intr : CallingConv<[ - CCAssignToStack<8, 8> -]>; - //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// @@ -1001,7 +994,7 @@ def CC_X86_64_Intr : CallingConv<[ def CC_X86_32 : CallingConv<[ // X86_INTR calling convention is valid in MCU target and should override the // MCU calling convention. Thus, this should be checked before isTargetMCU(). - CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, + CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>, CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>, @@ -1029,7 +1022,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_RegCall", CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>, CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>, - CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, + CCIfCC<"CallingConv::X86_INTR", CCCustom<"CC_X86_Intr">>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -1039,6 +1032,7 @@ def CC_X86_64 : CallingConv<[ ]>; // This is the argument convention used for the entire X86 backend. +let Entry = 1 in def CC_X86 : CallingConv<[ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, diff --git a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp index c3e76fd2a856..a61fa3246f09 100644 --- a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -1,9 +1,8 @@ //====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -102,9 +101,7 @@ namespace { /// Converts X86 cmov instructions into branches when profitable. 
class X86CmovConverterPass : public MachineFunctionPass { public: - X86CmovConverterPass() : MachineFunctionPass(ID) { - initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry()); - } + X86CmovConverterPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 cmov Conversion"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -281,7 +278,8 @@ bool X86CmovConverterPass::collectCmovCandidates( Group.clear(); // Condition code of first CMOV instruction current processed range and its // opposite condition code. - X86::CondCode FirstCC, FirstOppCC, MemOpCC; + X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID, + MemOpCC = X86::COND_INVALID; // Indicator of a non CMOVrr instruction in the current processed range. bool FoundNonCMOVInst = false; // Indicator for current processed CMOV-group if it should be skipped. @@ -291,7 +289,7 @@ bool X86CmovConverterPass::collectCmovCandidates( // Skip debug instructions. if (I.isDebugInstr()) continue; - X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); + X86::CondCode CC = X86::getCondFromCMov(I); // Check if we found a X86::CMOVrr instruction. if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) { if (Group.empty()) { @@ -546,7 +544,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( } unsigned CondCost = - DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth; + DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth; unsigned ValCost = getDepthOfOptCmov( DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth, DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth); @@ -594,7 +592,7 @@ static bool checkEFLAGSLive(MachineInstr *MI) { /// move all debug instructions to after the last CMOV instruction, making the /// CMOV group consecutive. static void packCmovGroup(MachineInstr *First, MachineInstr *Last) { - assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID && + assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID && "Last instruction in a CMOV group must be a CMOV instruction"); SmallVector<MachineInstr *, 2> DBGInstructions; @@ -652,14 +650,14 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MachineInstr *LastCMOV = Group.back(); DebugLoc DL = MI.getDebugLoc(); - X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI)); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); // Potentially swap the condition codes so that any memory operand to a CMOV // is in the *false* position instead of the *true* position. We can invert // any non-memory operand CMOV instructions to cope with this and we ensure // memory operand CMOVs are only included with a single condition code. if (llvm::any_of(Group, [&](MachineInstr *I) { - return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC; + return I->mayLoad() && X86::getCondFromCMov(*I) == CC; })) std::swap(CC, OppCC); @@ -690,7 +688,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. - BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB); + BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // Add the sink block to the false block successors. FalseMBB->addSuccessor(SinkMBB); @@ -713,8 +711,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( if (!MI.mayLoad()) { // Remember the false-side register input. 
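convertCmovInstsToBranches above rewrites a group of CMOVs into a compare-and-branch diamond; a sketch of the resulting block layout, with register names purely illustrative:

    //   MBB:      cmp/test ...
    //             JCC_1 SinkMBB, cc        ; when cc holds, skip FalseMBB
    //   FalseMBB: ...                      ; runs only when cc is false;
    //                                      ; memory-operand CMOVs have their
    //                                      ; loads materialized here
    //   SinkMBB:  %r = PHI [ %a, MBB ], [ %b, FalseMBB ]
    //                                      ; the two CMOV inputs, matched to
    //                                      ; the side on which each is selected
    //
    // If any CMOV in the group reads memory on the taken side, the pass swaps
    // cc and its opposite first, so the load always lands on the not-taken path.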
unsigned FalseReg = - MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2) - .getReg(); + MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg(); // Walk back through any intermediate cmovs referenced. while (true) { auto FRIt = FalseBBRegRewriteTable.find(FalseReg); @@ -729,7 +726,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // The condition must be the *opposite* of the one we've decided to branch // on as the branch will go *around* the load and the load should happen // when the CMOV condition is false. - assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC && + assert(X86::getCondFromCMov(MI) == OppCC && "Can only handle memory-operand cmov instructions with a condition " "opposite to the selected branch direction."); @@ -768,7 +765,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Move the new CMOV to just before the old one and reset any impacted // iterator. auto *NewCMOV = NewMIs.pop_back_val(); - assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC && + assert(X86::getCondFromCMov(*NewCMOV) == OppCC && "Last new instruction isn't the expected CMOV!"); LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump()); MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV); @@ -820,7 +817,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // If this CMOV we are processing is the opposite condition from the jump we // generated, then we have to swap the operands for the PHI that is going to // be generated. - if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC) + if (X86::getCondFromCMov(*MIIt) == OppCC) std::swap(Op1Reg, Op2Reg); auto Op1Itr = RegRewriteTable.find(Op1Reg); diff --git a/contrib/llvm/lib/Target/X86/X86CondBrFolding.cpp b/contrib/llvm/lib/Target/X86/X86CondBrFolding.cpp index 7ce443c4656a..9dea94f1368d 100644 --- a/contrib/llvm/lib/Target/X86/X86CondBrFolding.cpp +++ b/contrib/llvm/lib/Target/X86/X86CondBrFolding.cpp @@ -1,9 +1,8 @@ //===---- X86CondBrFolding.cpp - optimize conditional branches ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file defines a pass that optimizes condition branches on x86 by taking @@ -62,9 +61,7 @@ STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded"); namespace { class X86CondBrFoldingPass : public MachineFunctionPass { public: - X86CondBrFoldingPass() : MachineFunctionPass(ID) { - initializeX86CondBrFoldingPassPass(*PassRegistry::getPassRegistry()); - } + X86CondBrFoldingPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 CondBr Folding"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -226,10 +223,9 @@ void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB, MachineInstr *BrMI; if (MBBInfo->TBB == OrigDest) { BrMI = MBBInfo->BrInstr; - unsigned JNCC = GetCondBranchFromCond(MBBInfo->BranchCode); MachineInstrBuilder MIB = - BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(JNCC)) - .addMBB(NewDest); + BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1)) + .addMBB(NewDest).addImm(MBBInfo->BranchCode); MBBInfo->TBB = NewDest; MBBInfo->BrInstr = MIB.getInstr(); } else { // Should be the unconditional jump stmt. @@ -255,8 +251,8 @@ void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) { MachineInstr *BrMI = MBBInfo->BrInstr; X86::CondCode CC = MBBInfo->BranchCode; MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), - TII->get(GetCondBranchFromCond(CC))) - .addMBB(MBBInfo->TBB); + TII->get(X86::JCC_1)) + .addMBB(MBBInfo->TBB).addImm(CC); BrMI->eraseFromParent(); MBBInfo->BrInstr = MIB.getInstr(); @@ -324,8 +320,8 @@ void X86CondBrFolding::optimizeCondBr( llvm_unreachable("unexpected condtional code."); } BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI), - TII->get(GetCondBranchFromCond(NewCC))) - .addMBB(RootMBBInfo->FBB); + TII->get(X86::JCC_1)) + .addMBB(RootMBBInfo->FBB).addImm(NewCC); // RootMBB: Jump to TargetMBB BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI), @@ -513,7 +509,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) { if (I->isBranch()) { if (TBB) return nullptr; - CC = X86::getCondFromBranchOpc(I->getOpcode()); + CC = X86::getCondFromBranch(*I); switch (CC) { default: return nullptr; diff --git a/contrib/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/contrib/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp index 6bee20b617dd..7051550d52e6 100644 --- a/contrib/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ b/contrib/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -1,9 +1,8 @@ //===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -35,6 +34,14 @@ static cl::opt<bool> EnableDiscriminateMemops( "the build of the binary consuming the profile."), cl::Hidden); +static cl::opt<bool> BypassPrefetchInstructions( + "x86-bypass-prefetch-instructions", cl::init(true), + cl::desc("When discriminating instructions with memory operands, ignore " + "prefetch instructions. 
This ensures the other memory operand " + "instructions have the same identifiers after inserting " + "prefetches, allowing for successive insertions."), + cl::Hidden); + namespace { using Location = std::pair<StringRef, unsigned>; @@ -63,6 +70,10 @@ public: X86DiscriminateMemOps(); }; +bool IsPrefetchOpcode(unsigned Opcode) { + return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 || + Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2; +} } // end anonymous namespace //===----------------------------------------------------------------------===// @@ -86,7 +97,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { // have any debug info. const DILocation *ReferenceDI = DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); - + assert(ReferenceDI && "ReferenceDI should not be nullptr"); DenseMap<Location, unsigned> MemOpDiscriminators; MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; @@ -99,6 +110,8 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { const auto &DI = MI.getDebugLoc(); if (!DI) continue; + if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) + continue; Location Loc = diToLocation(DI); MemOpDiscriminators[Loc] = std::max(MemOpDiscriminators[Loc], DI->getBaseDiscriminator()); @@ -115,15 +128,18 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { for (auto &MI : MBB) { if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) continue; + if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) + continue; const DILocation *DI = MI.getDebugLoc(); - if (!DI) { + bool HasDebug = DI; + if (!HasDebug) { DI = ReferenceDI; } Location L = diToLocation(DI); DenseSet<unsigned> &Set = Seen[L]; const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert = Set.insert(DI->getBaseDiscriminator()); - if (!TryInsert.second) { + if (!TryInsert.second || !HasDebug) { unsigned BF, DF, CI = 0; DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI); Optional<unsigned> EncodedDiscriminator = DILocation::encodeDiscriminator( @@ -144,6 +160,7 @@ bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { // Since we were able to encode, bump the MemOpDiscriminators. ++MemOpDiscriminators[L]; DI = DI->cloneWithDiscriminator(EncodedDiscriminator.getValue()); + assert(DI && "DI should not be nullptr"); updateDebugInfo(&MI, DI); Changed = true; std::pair<DenseSet<unsigned>::iterator, bool> MustInsert = diff --git a/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp index d9ebbb506ca4..18bbfa32e11b 100644 --- a/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -1,9 +1,8 @@ //===--- X86DomainReassignment.cpp - Selectively switch register classes---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
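The X86DiscriminateMemOps hunks above add a bypass for prefetch instructions when handing out debug-location discriminators. The point, per the new option's description, is that a later prefetch-insertion pass should not renumber the memory operations an existing profile already refers to. The guard reduces to:

    // Prefetches are skipped both when collecting the per-location maximum and
    // when assigning new discriminators, so inserting PREFETCHT0/T1/T2/NTA
    // later leaves every other memory instruction's identifier unchanged.
    if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
      continue;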
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -387,9 +386,7 @@ class X86DomainReassignment : public MachineFunctionPass { public: static char ID; - X86DomainReassignment() : MachineFunctionPass(ID) { - initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry()); - } + X86DomainReassignment() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -557,6 +554,7 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { // Register already in this closure. if (!C.insertEdge(CurReg)) continue; + EnclosedEdges.insert(Reg); MachineInstr *DefMI = MRI->getVRegDef(CurReg); encloseInstr(C, DefMI); diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp index 80674c7251fe..58680f1815bb 100755 --- a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -1,10 +1,9 @@ //===- X86EvexToVex.cpp ---------------------------------------------------===// // Compress EVEX instructions to VEX encoding when possible to reduce code size // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,15 +12,15 @@ /// are encoded using the EVEX prefix and if possible replaces them by their /// corresponding VEX encoding which is usually shorter by 2 bytes. /// EVEX instructions may be encoded via the VEX prefix when the AVX-512 -/// instruction has a corresponding AVX/AVX2 opcode and when it does not -/// use the xmm or the mask registers or xmm/ymm registers with indexes -/// higher than 15. +/// instruction has a corresponding AVX/AVX2 opcode, when vector length +/// accessed by instruction is less than 512 bits and when it does not use +// the xmm or the mask registers or xmm/ymm registers with indexes higher than 15. /// The pass applies code reduction on the generated code for AVX-512 instrs. // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86InstComments.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86InstComments.h" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" @@ -69,9 +68,7 @@ class EvexToVexInstPass : public MachineFunctionPass { public: static char ID; - EvexToVexInstPass() : MachineFunctionPass(ID) { - initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry()); - } + EvexToVexInstPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return EVEX2VEX_DESC; } @@ -255,7 +252,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { (Desc.TSFlags & X86II::VEX_L) ? 
makeArrayRef(X86EvexToVex256CompressTable) : makeArrayRef(X86EvexToVex128CompressTable); - auto I = std::lower_bound(Table.begin(), Table.end(), MI.getOpcode()); + auto I = llvm::lower_bound(Table, MI.getOpcode()); if (I == Table.end() || I->EvexOpcode != MI.getOpcode()) return false; diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 1dd73163080b..b8624b40f2f7 100644 --- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -1,9 +1,8 @@ //===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,6 +26,7 @@ using namespace llvm; #define DEBUG_TYPE "x86-pseudo" +#define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass" namespace { class X86ExpandPseudo : public MachineFunctionPass { @@ -66,8 +66,12 @@ private: bool ExpandMBB(MachineBasicBlock &MBB); }; char X86ExpandPseudo::ID = 0; + } // End anonymous namespace. +INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false, + false) + void X86ExpandPseudo::ExpandICallBranchFunnel( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) { MachineBasicBlock *JTMBB = MBB; @@ -83,6 +87,8 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal(); auto CmpTarget = [&](unsigned Target) { + if (Selector.isReg()) + MBB->addLiveIn(Selector.getReg()); BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11) .addReg(X86::RIP) .addImm(1) @@ -98,11 +104,13 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( auto CreateMBB = [&]() { auto *NewMBB = MF->CreateMachineBasicBlock(BB); MBB->addSuccessor(NewMBB); + if (!MBB->isLiveIn(X86::EFLAGS)) + MBB->addLiveIn(X86::EFLAGS); return NewMBB; }; - auto EmitCondJump = [&](unsigned Opcode, MachineBasicBlock *ThenMBB) { - BuildMI(*MBB, MBBI, DL, TII->get(Opcode)).addMBB(ThenMBB); + auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) { + BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC); auto *ElseMBB = CreateMBB(); MF->insert(InsPt, ElseMBB); @@ -110,10 +118,10 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( MBBI = MBB->end(); }; - auto EmitCondJumpTarget = [&](unsigned Opcode, unsigned Target) { + auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) { auto *ThenMBB = CreateMBB(); TargetMBBs.push_back({ThenMBB, Target}); - EmitCondJump(Opcode, ThenMBB); + EmitCondJump(CC, ThenMBB); }; auto EmitTailCall = [&](unsigned Target) { @@ -130,23 +138,23 @@ void X86ExpandPseudo::ExpandICallBranchFunnel( if (NumTargets == 2) { CmpTarget(FirstTarget + 1); - EmitCondJumpTarget(X86::JB_1, FirstTarget); + EmitCondJumpTarget(X86::COND_B, FirstTarget); EmitTailCall(FirstTarget + 1); return; } if (NumTargets < 6) { CmpTarget(FirstTarget + 1); - EmitCondJumpTarget(X86::JB_1, FirstTarget); - EmitCondJumpTarget(X86::JE_1, FirstTarget + 1); + EmitCondJumpTarget(X86::COND_B, FirstTarget); + EmitCondJumpTarget(X86::COND_E, FirstTarget + 1); EmitBranchFunnel(FirstTarget + 2, NumTargets - 2); return; } auto *ThenMBB = 
CreateMBB(); CmpTarget(FirstTarget + (NumTargets / 2)); - EmitCondJump(X86::JB_1, ThenMBB); - EmitCondJumpTarget(X86::JE_1, FirstTarget + (NumTargets / 2)); + EmitCondJump(X86::COND_B, ThenMBB); + EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2)); EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1, NumTargets - (NumTargets / 2) - 1); @@ -254,16 +262,19 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, for (unsigned i = 0; i != 5; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { + JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) - .addReg(JumpTarget.getReg(), RegState::Kill); + .add(JumpTarget); } else { + JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) - .addReg(JumpTarget.getReg(), RegState::Kill); + .add(JumpTarget); } MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); + MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index 12cd613c34cb..7b9ce0271205 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -1,9 +1,8 @@ //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -85,7 +84,7 @@ private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, const DebugLoc &DL); - bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, + bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment = 1); bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, @@ -290,7 +289,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, } bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { - EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); + EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) // Unhandled type. Halt "fast" selection and bail. return false; @@ -312,12 +311,10 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); } -#include "X86GenCallingConv.inc" - /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. -bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, +bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment) { bool HasSSE41 = Subtarget->hasSSE41(); @@ -327,46 +324,42 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Treat i1 loads the same as i8 loads. Masking will be done when storing. 
+ if (VT == MVT::i1) + VT = MVT::i8; + // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return false; - case MVT::i1: case MVT::i8: Opc = X86::MOV8rm; - RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; - RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; - RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; - RC = &X86::GR64RegClass; break; case MVT::f32: - if (X86ScalarSSEf32) { - Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; - RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; - } else { + if (X86ScalarSSEf32) + Opc = HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt; + else Opc = X86::LD_Fp32m; - RC = &X86::RFP32RegClass; - } break; case MVT::f64: - if (X86ScalarSSEf64) { - Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; - RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; - } else { + if (X86ScalarSSEf64) + Opc = HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? X86::VMOVSDrm_alt : + X86::MOVSDrm_alt; + else Opc = X86::LD_Fp64m; - RC = &X86::RFP64RegClass; - } break; case MVT::f80: // No f80 support yet. @@ -381,7 +374,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; - RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v2f64: if (IsNonTemporal && Alignment >= 16 && HasSSE41) @@ -393,7 +385,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVUPDZ128rm : HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; - RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v4i32: case MVT::v2i64: @@ -408,7 +399,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVDQU64Z128rm : HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; - RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v8f32: assert(HasAVX); @@ -420,7 +410,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm; - RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); @@ -432,7 +421,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm; - RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v8i32: case MVT::v4i64: @@ -447,7 +435,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm; - RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v16f32: assert(HasAVX512); @@ -455,7 +442,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm; - RC = &X86::VR512RegClass; break; case MVT::v8f64: assert(HasAVX512); @@ -463,7 +449,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? 
X86::VMOVAPDZrm : X86::VMOVUPDZrm; - RC = &X86::VR512RegClass; break; case MVT::v8i64: case MVT::v16i32: @@ -476,10 +461,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = X86::VMOVNTDQAZrm; else Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm; - RC = &X86::VR512RegClass; break; } + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); @@ -1483,8 +1469,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. static const uint16_t SETFOpcTable[2][3] = { - { X86::SETEr, X86::SETNPr, X86::AND8rr }, - { X86::SETNEr, X86::SETPr, X86::OR8rr } + { X86::COND_E, X86::COND_NP, X86::AND8rr }, + { X86::COND_NE, X86::COND_P, X86::OR8rr } }; const uint16_t *SETFOpc = nullptr; switch (Predicate) { @@ -1500,10 +1486,10 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), - FlagReg1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), - FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg1).addImm(SETFOpc[0]); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg2).addImm(SETFOpc[1]); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), ResultReg).addReg(FlagReg1).addReg(FlagReg2); updateValueMap(I, ResultReg); @@ -1514,7 +1500,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) std::swap(LHS, RHS); @@ -1523,7 +1508,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + ResultReg).addImm(CC); updateValueMap(I, ResultReg); return true; } @@ -1693,11 +1679,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } bool SwapArgs; - unsigned BranchOpc; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) std::swap(CmpLHS, CmpRHS); @@ -1705,14 +1689,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(CC); // X86 requires a second branch to handle UNE (and OEQ, which is mapped // to UNE above). 
if (NeedExtraBranch) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(X86::COND_P); } finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); @@ -1739,14 +1723,14 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); - unsigned JmpOpc = X86::JNE_1; + unsigned JmpCond = X86::COND_NE; if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); - JmpOpc = X86::JE_1; + JmpCond = X86::COND_E; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(JmpCond); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; @@ -1759,10 +1743,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (TmpReg == 0) return false; - unsigned BranchOpc = X86::GetCondBranchFromCond(CC); - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(CC); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1786,8 +1768,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(OpReg) .addImm(1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) - .addMBB(TrueMBB); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JCC_1)) + .addMBB(TrueMBB).addImm(X86::COND_NE); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -2050,8 +2032,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
static const uint16_t SETFOpcTable[2][3] = { - { X86::SETNPr, X86::SETEr , X86::TEST8rr }, - { X86::SETPr, X86::SETNEr, X86::OR8rr } + { X86::COND_NP, X86::COND_E, X86::TEST8rr }, + { X86::COND_P, X86::COND_NE, X86::OR8rr } }; const uint16_t *SETFOpc = nullptr; switch (Predicate) { @@ -2083,10 +2065,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (SETFOpc) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), - FlagReg1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), - FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg1).addImm(SETFOpc[0]); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + FlagReg2).addImm(SETFOpc[1]); auto const &II = TII.get(SETFOpc[2]); if (II.getNumDefs()) { unsigned TmpReg = createResultReg(&X86::GR8RegClass); @@ -2147,9 +2129,9 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); - unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8); - unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, - LHSReg, LHSIsKill); + unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8); + unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CC); updateValueMap(I, ResultReg); return true; } @@ -2194,19 +2176,6 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - // Choose the SSE instruction sequence based on data type (float or double). - static const uint16_t OpcTable[2][4] = { - { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, - { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } - }; - - const uint16_t *Opc = nullptr; - switch (RetVT.SimpleTy) { - default: return false; - case MVT::f32: Opc = &OpcTable[0][0]; break; - case MVT::f64: Opc = &OpcTable[1][0]; break; - } - const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); @@ -2277,6 +2246,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { + // Choose the SSE instruction sequence based on data type (float or double). + static const uint16_t OpcTable[2][4] = { + { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, + { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } + }; + + const uint16_t *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; + } + const TargetRegisterClass *VR128 = &X86::VR128RegClass; unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); @@ -2303,8 +2285,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; case MVT::i32: Opc = X86::CMOV_GR32; break; - case MVT::f32: Opc = X86::CMOV_FR32; break; - case MVT::f64: Opc = X86::CMOV_FR64; break; + case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X + : X86::CMOV_FR32; break; + case MVT::f64: Opc = Subtarget->hasAVX512() ? 
X86::CMOV_FR64X + : X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); @@ -2485,13 +2469,14 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, assert((I->getOpcode() == Instruction::FPExt || I->getOpcode() == Instruction::FPTrunc) && "Instruction must be an FPExt or FPTrunc!"); + bool HasAVX = Subtarget->hasAVX(); unsigned OpReg = getRegForValue(I->getOperand(0)); if (OpReg == 0) return false; unsigned ImplicitDefReg; - if (Subtarget->hasAVX()) { + if (HasAVX) { ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); @@ -2503,7 +2488,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), ResultReg); - if (Subtarget->hasAVX()) + if (HasAVX) MIB.addReg(ImplicitDefReg); MIB.addReg(OpReg); @@ -2519,8 +2504,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { unsigned Opc = HasAVX512 ? X86::VCVTSS2SDZrr : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; - return X86SelectFPExtOrFPTrunc( - I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass); + return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64)); } return false; @@ -2534,8 +2518,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { unsigned Opc = HasAVX512 ? X86::VCVTSD2SSZrr : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; - return X86SelectFPExtOrFPTrunc( - I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass); + return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32)); } return false; @@ -2900,21 +2883,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { isCommutativeIntrinsic(II)) std::swap(LHS, RHS); - unsigned BaseOpc, CondOpc; + unsigned BaseOpc, CondCode; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: - BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + BaseOpc = ISD::ADD; CondCode = X86::COND_O; break; case Intrinsic::uadd_with_overflow: - BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + BaseOpc = ISD::ADD; CondCode = X86::COND_B; break; case Intrinsic::ssub_with_overflow: - BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + BaseOpc = ISD::SUB; CondCode = X86::COND_O; break; case Intrinsic::usub_with_overflow: - BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + BaseOpc = ISD::SUB; CondCode = X86::COND_B; break; case Intrinsic::smul_with_overflow: - BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break; case Intrinsic::umul_with_overflow: - BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break; } unsigned LHSReg = getRegForValue(LHS); @@ -2931,7 +2914,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { }; if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) && - CondOpc == X86::SETOr) { + CondCode == X86::COND_O) { // We can use INC/DEC. ResultReg = createResultReg(TLI.getRegClassFor(VT)); bool IsDec = BaseOpc == ISD::SUB; @@ -2990,8 +2973,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // Assign to a GPR since the overflow return value is lowered to a SETcc. 
unsigned ResultReg2 = createResultReg(&X86::GR8RegClass); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), - ResultReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), + ResultReg2).addImm(CondCode); updateValueMap(II, ResultReg, 2); return true; @@ -3509,8 +3492,9 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // This will be a direct call, or an indirect call through memory for // NonLazyBind calls or dllimport calls. - bool NeedLoad = - OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL; + bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT || + OpFlags == X86II::MO_GOTPCREL || + OpFlags == X86II::MO_COFFSTUB; unsigned CallOpc = NeedLoad ? (Is64Bit ? X86::CALL64m : X86::CALL32m) : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32); @@ -3595,7 +3579,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)), FI) .addReg(CopyReg); - Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; + Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt; addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg + i), FI); } @@ -3662,24 +3646,19 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return true; } case Instruction::BitCast: { - // Select SSE2/AVX bitcasts between 128/256 bit vector types. + // Select SSE2/AVX bitcasts between 128/256/512 bit vector types. if (!Subtarget->hasSSE2()) return false; - EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(DL, I->getType()); - - if (!SrcVT.isSimple() || !DstVT.isSimple()) + MVT SrcVT, DstVT; + if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) || + !isTypeLegal(I->getType(), DstVT)) return false; - MVT SVT = SrcVT.getSimpleVT(); - MVT DVT = DstVT.getSimpleVT(); - - if (!SVT.is128BitVector() && - !(Subtarget->hasAVX() && SVT.is256BitVector()) && - !(Subtarget->hasAVX512() && SVT.is512BitVector() && - (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 && - DVT.getScalarSizeInBits() >= 32)))) + // Only allow vectors that use xmm/ymm/zmm. + if (!SrcVT.isVector() || !DstVT.isVector() || + SrcVT.getVectorElementType() == MVT::i1 || + DstVT.getVectorElementType() == MVT::i1) return false; unsigned Reg = getRegForValue(I->getOperand(0)); @@ -3757,30 +3736,25 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; + bool HasAVX = Subtarget->hasAVX(); + bool HasAVX512 = Subtarget->hasAVX512(); switch (VT.SimpleTy) { default: return 0; case MVT::f32: - if (X86ScalarSSEf32) { - Opc = Subtarget->hasAVX512() - ? X86::VMOVSSZrm - : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; - } else { + if (X86ScalarSSEf32) + Opc = HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt; + else Opc = X86::LD_Fp32m; - RC = &X86::RFP32RegClass; - } break; case MVT::f64: - if (X86ScalarSSEf64) { - Opc = Subtarget->hasAVX512() - ? X86::VMOVSDZrm - : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; - } else { + if (X86ScalarSSEf64) + Opc = HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? 
X86::VMOVSDrm_alt : + X86::MOVSDrm_alt; + else Opc = X86::LD_Fp64m; - RC = &X86::RFP64RegClass; - } break; case MVT::f80: // No f80 support yet. @@ -3806,7 +3780,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { // Create the load from the constant pool. unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); - unsigned ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); if (CM == CodeModel::Large) { unsigned AddrReg = createResultReg(&X86::GR64RegClass); @@ -3916,33 +3890,26 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { // Get opcode and regclass for the given zero. bool HasAVX512 = Subtarget->hasAVX512(); unsigned Opc = 0; - const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: - if (X86ScalarSSEf32) { + if (X86ScalarSSEf32) Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS; - RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; - } else { + else Opc = X86::LD_Fp032; - RC = &X86::RFP32RegClass; - } break; case MVT::f64: - if (X86ScalarSSEf64) { + if (X86ScalarSSEf64) Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD; - RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; - } else { + else Opc = X86::LD_Fp064; - RC = &X86::RFP64RegClass; - } break; case MVT::f80: // No f80 support yet. return 0; } - unsigned ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); return ResultReg; } @@ -3992,6 +3959,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); + Result->cloneInstrSymbols(*FuncInfo.MF, *MI); MachineBasicBlock::iterator I(MI); removeDeadCode(I, std::next(I)); return true; diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp index ed297e678203..bf541d933790 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -1,9 +1,8 @@ //===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -103,9 +102,7 @@ public: StringRef getPassName() const override { return FIXUPBW_DESC; } - FixupBWInstPass() : MachineFunctionPass(ID) { - initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry()); - } + FixupBWInstPass() : MachineFunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to @@ -151,7 +148,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { this->MF = &MF; TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); - OptForSize = MF.getFunction().optForSize(); + OptForSize = MF.getFunction().hasOptSize(); MLI = &getAnalysis<MachineLoopInfo>(); LiveRegs.init(TII->getRegisterInfo()); diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index a346085a52cb..041529a0be68 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -1,15 +1,14 @@ //===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the pass that finds instructions that can be // re-written as LEA instructions in order to reduce pipeline delays. -// When optimizing for size it replaces suitable LEAs with INC or DEC. +// It replaces LEAs with ADD/INC/DEC when that is better for size/speed. // //===----------------------------------------------------------------------===// @@ -36,31 +35,25 @@ namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - /// Loop over all of the instructions in the basic block - /// replacing applicable instructions with LEA instructions, - /// where appropriate. - bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI, - bool IsSlowLEA, bool IsSlow3OpsLEA); - /// Given a machine register, look for the instruction /// which writes it in the current basic block. If found, /// try to replace it with an equivalent LEA instruction. /// If replacement succeeds, then also process the newly created /// instruction. void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// Given a memory access or LEA instruction /// whose address mode uses a base and/or index register, look for /// an opportunity to replace the instruction which sets the base or index /// register with an equivalent LEA instruction. void processInstruction(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// Given a LEA instruction which is unprofitable /// on SlowLEA targets try to replace it with an equivalent ADD instruction. void processInstructionForSlowLEA(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// Given a LEA instruction which is unprofitable /// on SNB+ try to replace it with other instructions. 
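A pattern that recurs throughout this merge, first visible in the X86FastISel hunks above, is the retirement of the per-condition opcodes (X86::SETEr, X86::JNE_1, X86::CMOVB64rr, ...) in favour of the generic X86::SETCCr, X86::JCC_1 and X86::CMOV*rr opcodes, which carry the condition code as a trailing immediate operand. The sketch below shows the new emission shape only; it assumes the usual emission state (block, insertion point, DebugLoc, TargetInstrInfo) and an already-computed X86::CondCode, and the helper name is illustrative, not part of the tree:

// Illustrative only: a setcc + conditional branch pair built after the
// refactor, with the condition as an immediate operand rather than an opcode.
static void emitSetccAndBranch(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator InsertPt,
                               const DebugLoc &DL, const TargetInstrInfo &TII,
                               unsigned DstReg, X86::CondCode CC,
                               MachineBasicBlock *TrueMBB) {
  // setCC %DstReg -- one opcode for all sixteen conditions.
  BuildMI(MBB, InsertPt, DL, TII.get(X86::SETCCr), DstReg).addImm(CC);
  // jCC TrueMBB -- likewise a single JCC_1 opcode; the condition is the last
  // operand instead of being baked into the mnemonic.
  BuildMI(MBB, InsertPt, DL, TII.get(X86::JCC_1)).addMBB(TrueMBB).addImm(CC);
}

The same shape reappears further down in X86FlagsCopyLowering and X86FrameLowering, where the old GetCondBranchFromCond/getSETFromCond/getCMovFromCond helpers disappear.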
@@ -75,12 +68,13 @@ class FixupLEAPass : public MachineFunctionPass { /// - LEA that uses 16-bit addressing mode " /// This function currently handles the first 2 cases only. MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); - /// Look for LEAs that add 1 to reg or subtract 1 from reg - /// and convert them to INC or DEC respectively. - bool fixupIncDec(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) const; + /// Look for LEAs that are really two address LEAs that we might be able to + /// turn into regular ADD instructions. + bool optTwoAddrLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec, + bool UseLEAForSP) const; /// Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. @@ -91,12 +85,12 @@ class FixupLEAPass : public MachineFunctionPass { /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. MachineBasicBlock::iterator searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI); + MachineBasicBlock &MBB); /// if an instruction can be converted to an /// equivalent LEA, insert the new instruction into the basic block /// and return a pointer to it. Otherwise, return zero. - MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineInstr *postRAConvertToLEA(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const; public: @@ -104,9 +98,7 @@ public: StringRef getPassName() const override { return FIXUPLEA_DESC; } - FixupLEAPass() : MachineFunctionPass(ID) { - initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); - } + FixupLEAPass() : MachineFunctionPass(ID) { } /// Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -121,10 +113,8 @@ public: private: TargetSchedModel TSM; - MachineFunction *MF; - const X86InstrInfo *TII; // Machine instruction info. - bool OptIncDec; - bool OptLEA; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; }; } @@ -133,7 +123,7 @@ char FixupLEAPass::ID = 0; INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) MachineInstr * -FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, +FixupLEAPass::postRAConvertToLEA(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { @@ -142,7 +132,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, const MachineOperand &Src = MI.getOperand(1); const MachineOperand &Dest = MI.getOperand(0); MachineInstr *NewMI = - BuildMI(*MF, MI.getDebugLoc(), + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) .add(Dest) @@ -151,9 +141,17 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, .addReg(0) .addImm(0) .addReg(0); - MFI->insert(MBBI, NewMI); // Insert the new inst return NewMI; } + } + + if (!MI.isConvertibleTo3Addr()) + return nullptr; + + switch (MI.getOpcode()) { + default: + // Only convert instructions that we've verified are safe. 
+ return nullptr; case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: @@ -162,52 +160,80 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, case X86::ADD32ri8: case X86::ADD32ri_DB: case X86::ADD32ri8_DB: - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri_DB: - case X86::ADD16ri8_DB: if (!MI.getOperand(2).isImm()) { // convertToThreeAddress will call getImm() // which requires isImm() to be true return nullptr; } break; - case X86::ADD16rr: - case X86::ADD16rr_DB: - if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg()) { - // if src1 != src2, then convertToThreeAddress will - // need to create a Virtual register, which we cannot do - // after register allocation. - return nullptr; - } + case X86::SHL64ri: + case X86::SHL32ri: + case X86::INC64r: + case X86::INC32r: + case X86::DEC64r: + case X86::DEC32r: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: + // These instructions are all fine to convert. + break; } + MachineFunction::iterator MFI = MBB.getIterator(); return TII->convertToThreeAddress(MFI, MI, nullptr); } FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } -bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { - if (skipFunction(Func.getFunction())) +static bool isLEA(unsigned Opcode) { + return Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r; +} + +bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) return false; - MF = &Func; - const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); bool IsSlowLEA = ST.slowLEA(); bool IsSlow3OpsLEA = ST.slow3OpsLEA(); + bool LEAUsesAG = ST.LEAusesAG(); - OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize(); - OptLEA = ST.LEAusesAG() || IsSlowLEA || IsSlow3OpsLEA; - - if (!OptLEA && !OptIncDec) - return false; + bool OptIncDec = !ST.slowIncDec() || MF.getFunction().hasOptSize(); + bool UseLEAForSP = ST.useLeaForSP(); - TSM.init(&Func.getSubtarget()); + TSM.init(&ST); TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";); - // Process all basic blocks. - for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) - processBasicBlock(Func, I, IsSlowLEA, IsSlow3OpsLEA); + for (MachineBasicBlock &MBB : MF) { + // First pass. Try to remove or optimize existing LEAs. + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (!isLEA(I->getOpcode())) + continue; + + if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) + continue; + + if (IsSlowLEA) { + processInstructionForSlowLEA(I, MBB); + } else if (IsSlow3OpsLEA) { + if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) { + MBB.erase(I); + I = NewMI; + } + } + } + + // Second pass for creating LEAs. This may reverse some of the + // transformations above. 
+ if (LEAUsesAG) { + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + processInstruction(I, MBB); + } + } + LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";); return true; @@ -218,7 +244,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; MachineInstr &MI = *I; - for (unsigned int i = 0; i < MI.getNumOperands(); ++i) { + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { MachineOperand &opnd = MI.getOperand(i); if (opnd.isReg() && opnd.getReg() == p.getReg()) { if (opnd.isDef()) @@ -234,10 +260,10 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { /// wrapping around to the last instruction of the block if the block /// branches to itself. static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { - if (I == MFI->begin()) { - if (MFI->isPredecessor(&*MFI)) { - I = --MFI->end(); + MachineBasicBlock &MBB) { + if (I == MBB.begin()) { + if (MBB.isPredecessor(&MBB)) { + I = --MBB.end(); return true; } else return false; @@ -248,14 +274,14 @@ static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { + MachineBasicBlock &MBB) { int InstrDistance = 1; MachineBasicBlock::iterator CurInst; static const int INSTR_DISTANCE_THRESHOLD = 5; CurInst = I; bool Found; - Found = getPreviousInstr(CurInst, MFI); + Found = getPreviousInstr(CurInst, MBB); while (Found && I != CurInst) { if (CurInst->isCall() || CurInst->isInlineAsm()) break; @@ -265,17 +291,12 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return CurInst; } InstrDistance += TSM.computeInstrLatency(&*CurInst); - Found = getPreviousInstr(CurInst, MFI); + Found = getPreviousInstr(CurInst, MBB); } return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int Opcode) { - return Opcode == X86::LEA16r || Opcode == X86::LEA32r || - Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; -} - -static inline bool isInefficientLEAReg(unsigned int Reg) { +static inline bool isInefficientLEAReg(unsigned Reg) { return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13D || Reg == X86::R13; } @@ -298,27 +319,24 @@ static inline bool hasLEAOffset(const MachineOperand &Offset) { return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); } -static inline int getADDrrFromLEA(int LEAOpcode) { +static inline unsigned getADDrrFromLEA(unsigned LEAOpcode) { switch (LEAOpcode) { default: llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - return X86::ADD16rr; case X86::LEA32r: - return X86::ADD32rr; case X86::LEA64_32r: + return X86::ADD32rr; case X86::LEA64r: return X86::ADD64rr; } } -static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { +static inline unsigned getADDriFromLEA(unsigned LEAOpcode, + const MachineOperand &Offset) { bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); switch (LEAOpcode) { default: llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; case X86::LEA32r: case X86::LEA64_32r: return IsInt8 ? 
X86::ADD32ri8 : X86::ADD32ri; @@ -327,56 +345,110 @@ static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { } } -/// isLEASimpleIncOrDec - Does this LEA have one these forms: -/// lea %reg, 1(%reg) -/// lea %reg, -1(%reg) -static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) { - unsigned SrcReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg(); - unsigned DstReg = LEA.getOperand(0).getReg(); - const MachineOperand &AddrDisp = LEA.getOperand(1 + X86::AddrDisp); - return SrcReg == DstReg && - LEA.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && - LEA.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && - AddrDisp.isImm() && - (AddrDisp.getImm() == 1 || AddrDisp.getImm() == -1); +static inline unsigned getINCDECFromLEA(unsigned LEAOpcode, bool IsINC) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA32r: + case X86::LEA64_32r: + return IsINC ? X86::INC32r : X86::DEC32r; + case X86::LEA64r: + return IsINC ? X86::INC64r : X86::DEC64r; + } } -bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) const { +bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec, + bool UseLEAForSP) const { MachineInstr &MI = *I; - int Opcode = MI.getOpcode(); - if (!isLEA(Opcode)) + + const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); + const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt); + const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg); + const MachineOperand &Disp = MI.getOperand(1 + X86::AddrDisp); + const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); + + if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 || + !TII->isSafeToClobberEFLAGS(MBB, I)) return false; - if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { - int NewOpcode; - bool isINC = MI.getOperand(1 + X86::AddrDisp).getImm() == 1; - switch (Opcode) { - case X86::LEA16r: - NewOpcode = isINC ? X86::INC16r : X86::DEC16r; - break; - case X86::LEA32r: - case X86::LEA64_32r: - NewOpcode = isINC ? X86::INC32r : X86::DEC32r; - break; - case X86::LEA64r: - NewOpcode = isINC ? X86::INC64r : X86::DEC64r; - break; - } + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned BaseReg = Base.getReg(); + unsigned IndexReg = Index.getReg(); - MachineInstr *NewMI = - BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode)) - .add(MI.getOperand(0)) - .add(MI.getOperand(1 + X86::AddrBaseReg)); - MFI->erase(I); - I = static_cast<MachineBasicBlock::iterator>(NewMI); - return true; + // Don't change stack adjustment LEAs. + if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP)) + return false; + + // LEA64_32 has 64-bit operands but 32-bit result. + if (MI.getOpcode() == X86::LEA64_32r) { + if (BaseReg != 0) + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + if (IndexReg != 0) + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); } - return false; + + MachineInstr *NewMI = nullptr; + + // Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1 + // which can be turned into add %reg2, %reg1 + if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 && + (DestReg == BaseReg || DestReg == IndexReg)) { + unsigned NewOpcode = getADDrrFromLEA(MI.getOpcode()); + if (DestReg != BaseReg) + std::swap(BaseReg, IndexReg); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? 
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addReg(IndexReg) + .addReg(Base.getReg(), RegState::Implicit) + .addReg(Index.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addReg(IndexReg); + } + } else if (DestReg == BaseReg && IndexReg == 0) { + // This is an LEA with only a base register and a displacement, + // We can use ADDri or INC/DEC. + + // Does this LEA have one these forms: + // lea %reg, 1(%reg) + // lea %reg, -1(%reg) + if (OptIncDec && (Disp.getImm() == 1 || Disp.getImm() == -1)) { + bool IsINC = Disp.getImm() == 1; + unsigned NewOpcode = getINCDECFromLEA(MI.getOpcode(), IsINC); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addReg(Base.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg); + } + } else { + unsigned NewOpcode = getADDriFromLEA(MI.getOpcode(), Disp); + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addImm(Disp.getImm()) + .addReg(Base.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode), DestReg) + .addReg(BaseReg).addImm(Disp.getImm()); + } + } + } else + return false; + + MBB.erase(I); + I = NewMI; + return true; } void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { + MachineBasicBlock &MBB) { // Process a load, store, or LEA instruction. MachineInstr &MI = *I; const MCInstrDesc &Desc = MI.getDesc(); @@ -385,40 +457,38 @@ void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, AddrOffset += X86II::getOperandBias(Desc); MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg); if (p.isReg() && p.getReg() != X86::ESP) { - seekLEAFixup(p, I, MFI); + seekLEAFixup(p, I, MBB); } MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg); if (q.isReg() && q.getReg() != X86::ESP) { - seekLEAFixup(q, I, MFI); + seekLEAFixup(q, I, MBB); } } } void FixupLEAPass::seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { - MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MBB); if (MBI != MachineBasicBlock::iterator()) { - MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); + MachineInstr *NewMI = postRAConvertToLEA(MBB, MBI); if (NewMI) { ++NumLEAs; LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); // now to replace with an equivalent LEA... 
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); - MFI->erase(MBI); + MBB.erase(MBI); MachineBasicBlock::iterator J = static_cast<MachineBasicBlock::iterator>(NewMI); - processInstruction(J, MFI); + processInstruction(J, MBB); } } } void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, - MachineFunction::iterator MFI) { + MachineBasicBlock &MBB) { MachineInstr &MI = *I; - const int Opcode = MI.getOpcode(); - if (!isLEA(Opcode)) - return; + const unsigned Opcode = MI.getOpcode(); const MachineOperand &Dst = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); @@ -428,7 +498,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); if (Segment.getReg() != 0 || !Offset.isImm() || - !TII->isSafeToClobberEFLAGS(*MFI, I)) + !TII->isSafeToClobberEFLAGS(MBB, I)) return; const unsigned DstR = Dst.getReg(); const unsigned SrcR1 = Base.getReg(); @@ -445,7 +515,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); const MachineOperand &Src = SrcR1 == DstR ? Index : Base; NewMI = - BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); + BuildMI(MBB, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); LLVM_DEBUG(NewMI->dump();); } // Make ADD instruction for immediate @@ -453,24 +523,21 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(Opcode, Offset)); const MachineOperand &SrcR = SrcR1 == DstR ? Base : Index; - NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), ADDri, DstR) .add(SrcR) .addImm(Offset.getImm()); LLVM_DEBUG(NewMI->dump();); } if (NewMI) { - MFI->erase(I); + MBB.erase(I); I = NewMI; } } MachineInstr * FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineFunction::iterator MFI) { - - const int LEAOpcode = MI.getOpcode(); - if (!isLEA(LEAOpcode)) - return nullptr; + MachineBasicBlock &MBB) { + const unsigned LEAOpcode = MI.getOpcode(); const MachineOperand &Dst = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); @@ -481,13 +548,13 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || - !TII->isSafeToClobberEFLAGS(*MFI, MI) || + !TII->isSafeToClobberEFLAGS(MBB, MI) || Segment.getReg() != X86::NoRegister) return nullptr; - unsigned int DstR = Dst.getReg(); - unsigned int BaseR = Base.getReg(); - unsigned int IndexR = Index.getReg(); + unsigned DstR = Dst.getReg(); + unsigned BaseR = Base.getReg(); + unsigned IndexR = Index.getReg(); unsigned SSDstR = (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR; bool IsScale1 = Scale.getImm() == 1; @@ -516,11 +583,11 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { const MachineOperand &Src = DstR == BaseR ? Index : Base; MachineInstr *NewMI = - BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); LLVM_DEBUG(NewMI->dump();); // Create ADD instruction for the Offset in case of 3-Ops LEA. 
if (hasLEAOffset(Offset)) { - NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); LLVM_DEBUG(NewMI->dump();); } return NewMI; @@ -530,7 +597,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, // lea offset(%base,%index,scale),%dst => // lea (%base,%index,scale); add offset,%dst if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) .add(Dst) .add(IsInefficientBase ? Index : Base) .add(Scale) @@ -540,7 +607,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, LLVM_DEBUG(NewMI->dump();); // Create ADD instruction for the Offset in case of 3-Ops LEA. if (hasLEAOffset(Offset)) { - NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); LLVM_DEBUG(NewMI->dump();); } return NewMI; @@ -552,17 +619,17 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst if (IsScale1 && !hasLEAOffset(Offset)) { bool BIK = Base.isKill() && BaseR != IndexR; - TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, BIK); + TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK); LLVM_DEBUG(MI.getPrevNode()->dump();); MachineInstr *NewMI = - BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); LLVM_DEBUG(NewMI->dump();); return NewMI; } // lea offset(%base,%index,scale), %dst => // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) .add(Dst) .addReg(0) .add(Scale) @@ -571,35 +638,7 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, .add(Segment); LLVM_DEBUG(NewMI->dump();); - NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); LLVM_DEBUG(NewMI->dump();); return NewMI; } - -bool FixupLEAPass::processBasicBlock(MachineFunction &MF, - MachineFunction::iterator MFI, - bool IsSlowLEA, bool IsSlow3OpsLEA) { - for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (OptIncDec) - if (fixupIncDec(I, MFI)) - continue; - - if (OptLEA) { - if (IsSlowLEA) { - processInstructionForSlowLEA(I, MFI); - continue; - } - - if (IsSlow3OpsLEA) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { - MFI->erase(I); - I = NewMI; - } - continue; - } - - processInstruction(I, MFI); - } - } - return false; -} diff --git a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp index a86eb997635e..e2d4d1ede6f3 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -1,9 +1,8 @@ //===---- X86FixupSetCC.cpp - optimize usage of LEA instructions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -68,30 +67,6 @@ char X86FixupSetCCPass::ID = 0; FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); } -bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) { - switch (Opcode) { - default: - return false; - case X86::SETOr: - case X86::SETNOr: - case X86::SETBr: - case X86::SETAEr: - case X86::SETEr: - case X86::SETNEr: - case X86::SETBEr: - case X86::SETAr: - case X86::SETSr: - case X86::SETNSr: - case X86::SETPr: - case X86::SETNPr: - case X86::SETLr: - case X86::SETGEr: - case X86::SETLEr: - case X86::SETGr: - return true; - } -} - // We expect the instruction *immediately* before the setcc to imp-def // EFLAGS (because of scheduling glue). To make this less brittle w.r.t // scheduling, look backwards until we hit the beginning of the @@ -103,7 +78,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB, auto MBBStart = MBB->rend(); for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI) for (auto &Op : MI->implicit_operands()) - if ((Op.getReg() == X86::EFLAGS) && (Op.isDef())) + if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef()) return &*MI; return nullptr; @@ -111,7 +86,7 @@ X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB, bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) { for (auto &Op : MI->implicit_operands()) - if ((Op.getReg() == X86::EFLAGS) && (Op.isUse())) + if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse()) return true; return false; @@ -129,7 +104,7 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { // Find a setcc that is used by a zext. // This doesn't have to be the only use, the transformation is safe // regardless. - if (!isSetCCr(MI.getOpcode())) + if (MI.getOpcode() != X86::SETCCr) continue; MachineInstr *ZExt = nullptr; diff --git a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index 778aa505b2d9..5ce3255ea96a 100644 --- a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -1,9 +1,8 @@ //====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -71,12 +70,6 @@ STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted"); STATISTIC(NumTestsInserted, "Number of test instructions inserted"); STATISTIC(NumAddsInserted, "Number of adds instructions inserted"); -namespace llvm { - -void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); - -} // end namespace llvm - namespace { // Convenient array type for storing registers associated with each condition. 
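The X86FixupLEAs hunks above fold the old fixupIncDec/processBasicBlock machinery into optTwoAddrLEA, which rewrites an LEA whose destination is also one of its address registers into a plain ADD, INC or DEC. A simplified sketch of the classification it performs follows; the real pass additionally checks EFLAGS clobber safety, the segment register, scale == 1 and whether the destination is the stack pointer, and the enum and function names here are illustrative only:

#include <cstdint>

enum class LEARewrite { None, AddRR, IncDec, AddRI };

// Register number 0 means "no register", as in the MachineOperand encoding.
LEARewrite classifyTwoAddrLEA(unsigned Dst, unsigned Base, unsigned Index,
                              int64_t Disp, bool OptIncDec) {
  if (Base && Index && Disp == 0 && (Dst == Base || Dst == Index))
    return LEARewrite::AddRR;      // lea (%a,%b), %a        -> add %b, %a
  if (Dst == Base && Index == 0) {
    if (OptIncDec && (Disp == 1 || Disp == -1))
      return LEARewrite::IncDec;   // lea 1(%a)/-1(%a), %a   -> inc/dec %a
    return LEARewrite::AddRI;      // lea d(%a), %a          -> add $d, %a
  }
  return LEARewrite::None;         // keep the LEA
}

For LEA64_32r the pass first narrows the base/index registers to their 32-bit sub-registers, which is why the rewritten ADD/INC/DEC forms above carry the original 64-bit registers as implicit uses.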
@@ -84,9 +77,7 @@ using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>; class X86FlagsCopyLoweringPass : public MachineFunctionPass { public: - X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { - initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry()); - } + X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -252,13 +243,13 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, "Split instruction must be in the split block!"); assert(SplitI.isBranch() && "Only designed to split a tail of branch instructions!"); - assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID && + assert(X86::getCondFromBranch(SplitI) != X86::COND_INVALID && "Must split on an actual jCC instruction!"); // Dig out the previous instruction to the split point. MachineInstr &PrevI = *std::prev(SplitI.getIterator()); assert(PrevI.isBranch() && "Must split after a branch!"); - assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID && + assert(X86::getCondFromBranch(PrevI) != X86::COND_INVALID && "Must split after an actual jCC instruction!"); assert(!std::prev(PrevI.getIterator())->isTerminator() && "Must only have this one terminator prior to the split!"); @@ -588,22 +579,21 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { // branch folding or black placement. As a consequence, we get to deal // with the simpler formulation of conditional branches followed by tail // calls. - if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { + if (X86::getCondFromBranch(MI) != X86::COND_INVALID) { auto JmpIt = MI.getIterator(); do { JmpIs.push_back(&*JmpIt); ++JmpIt; } while (JmpIt != UseMBB.instr_end() && - X86::getCondFromBranchOpc(JmpIt->getOpcode()) != + X86::getCondFromBranch(*JmpIt) != X86::COND_INVALID); break; } // Otherwise we can just rewrite in-place. - if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { + if (X86::getCondFromCMov(MI) != X86::COND_INVALID) { rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - } else if (X86::getCondFromSETOpc(MI.getOpcode()) != - X86::COND_INVALID) { + } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) { rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); } else if (MI.getOpcode() == TargetOpcode::COPY) { rewriteCopy(MI, *FlagUse, CopyDefI); @@ -730,7 +720,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( // Scan backwards across the range of instructions with live EFLAGS. 
for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { - X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); + X86::CondCode Cond = X86::getCondFromSETCC(MI); if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && TRI->isVirtualRegister(MI.getOperand(0).getReg())) { assert(MI.getOperand(0).isDef() && @@ -751,7 +741,7 @@ unsigned X86FlagsCopyLoweringPass::promoteCondToReg( DebugLoc TestLoc, X86::CondCode Cond) { unsigned Reg = MRI->createVirtualRegister(PromoteRC); auto SetI = BuildMI(TestMBB, TestPos, TestLoc, - TII->get(X86::getSETFromCond(Cond)), Reg); + TII->get(X86::SETCCr), Reg).addImm(Cond); (void)SetI; LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump()); ++NumSetCCsInserted; @@ -842,7 +832,7 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, MachineOperand &FlagUse, CondRegArray &CondRegs) { // First get the register containing this specific condition. - X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode()); + X86::CondCode Cond = X86::getCondFromCMov(CMovI); unsigned CondReg; bool Inverted; std::tie(CondReg, Inverted) = @@ -853,12 +843,10 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, // Insert a direct test of the saved register. insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); - // Rewrite the CMov to use the !ZF flag from the test (but match register - // size and memory operand), and then kill its use of the flags afterward. - auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg()); - CMovI.setDesc(TII->get(X86::getCMovFromCond( - Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8, - !CMovI.memoperands_empty()))); + // Rewrite the CMov to use the !ZF flag from the test, and then kill its use + // of the flags afterward. + CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1) + .setImm(Inverted ? X86::COND_E : X86::COND_NE); FlagUse.setIsKill(true); LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); } @@ -867,7 +855,7 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { // First get the register containing this specific condition. - X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode()); + X86::CondCode Cond = X86::getCondFromBranch(JmpI); unsigned CondReg; bool Inverted; std::tie(CondReg, Inverted) = @@ -880,10 +868,8 @@ void X86FlagsCopyLoweringPass::rewriteCondJmp( // Rewrite the jump to use the !ZF flag from the test, and kill its use of // flags afterward. - JmpI.setDesc(TII->get( - X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE))); - const int ImplicitEFLAGSOpIdx = 1; - JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true); + JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE); + JmpI.findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump()); } @@ -1026,7 +1012,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, MachineInstr &SetCCI, MachineOperand &FlagUse, CondRegArray &CondRegs) { - X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode()); + X86::CondCode Cond = X86::getCondFromSETCC(SetCCI); // Note that we can't usefully rewrite this to the inverse without complex // analysis of the users of the setCC. Largely we rely on duplicates which // could have been avoided already being avoided here. 
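The X86FlagsCopyLowering changes above follow directly from the same opcode consolidation: conditions are now read back with getCondFromSETCC/getCondFromCMov/getCondFromBranch, and rewriting a consumer of saved flags no longer means swapping in a different opcode, only overwriting its condition operand. A sketch of that rewrite, assuming the condition code is the last explicit operand (as it is for SETCCr, JCC_1 and the CMOV*rr forms); the helper is illustrative, not pass code:

// After insertTest() has materialised a TEST of the saved condition register,
// retarget the consumer to the ZF result by rewriting its condition operand.
static void retargetToTestResult(MachineInstr &MI, bool Inverted) {
  // MCInstrDesc::getNumOperands() excludes implicit operands, so this indexes
  // the trailing condition-code immediate.
  unsigned CondIdx = MI.getDesc().getNumOperands() - 1;
  MI.getOperand(CondIdx).setImm(Inverted ? X86::COND_E : X86::COND_NE);
}

This is exactly what rewriteCMov does above, and rewriteCondJmp is the operand-index-1 special case of it.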
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index f330acff61a1..074cf21d03f5 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -1,9 +1,8 @@ //===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -60,7 +59,6 @@ namespace { struct FPS : public MachineFunctionPass { static char ID; FPS() : MachineFunctionPass(ID) { - initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); // This is really only to keep valgrind quiet. // The logic in isLive() is too much for it. memset(Stack, 0, sizeof(Stack)); @@ -299,9 +297,16 @@ namespace { void setKillFlags(MachineBasicBlock &MBB) const; }; - char FPS::ID = 0; } +char FPS::ID = 0; + +INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier", + false, false) +INITIALIZE_PASS_DEPENDENCY(EdgeBundles) +INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier", + false, false) + FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } /// getFPReg - Return the X86::FPx register number for the specified operand. @@ -591,7 +596,7 @@ namespace { } static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { - const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + const TableEntry *I = llvm::lower_bound(Table, Opcode); if (I != Table.end() && I->from == Opcode) return I->to; return -1; @@ -1096,6 +1101,8 @@ void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { // Change from the pseudo instruction to the concrete instruction. MI.RemoveOperand(0); // Remove the explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); + MI.addOperand( + MachineOperand::CreateReg(X86::ST0, /*isDef*/ true, /*isImp*/ true)); // Result gets pushed on the stack. pushReg(DestReg); @@ -1140,6 +1147,8 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { // Convert from the pseudo instruction to the concrete instruction. MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode()))); + MI.addOperand( + MachineOperand::CreateReg(X86::ST0, /*isDef*/ false, /*isImp*/ true)); if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m || MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m || @@ -1369,8 +1378,6 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { /// register arguments and no explicit destinations. /// void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { - ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); - ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); MachineInstr &MI = *I; unsigned NumOperands = MI.getDesc().getNumOperands(); @@ -1475,7 +1482,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { break; } - case TargetOpcode::INLINEASM: { + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: { // The inline asm MachineInstr currently only *uses* FP registers for the // 'f' constraint. 
These should be turned into the current ST(x) register // in the machine instr. diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 984db12201ed..e310fe069117 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1,9 +1,8 @@ //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -585,23 +584,23 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, // registers. For the prolog expansion we use RAX, RCX and RDX. MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RegClass = &X86::GR64RegClass; - const unsigned SizeReg = InProlog ? (unsigned)X86::RAX + const Register SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass), - ZeroReg = InProlog ? (unsigned)X86::RCX + ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), - CopyReg = InProlog ? (unsigned)X86::RDX + CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - TestReg = InProlog ? (unsigned)X86::RDX + TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - FinalReg = InProlog ? (unsigned)X86::RDX + FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - RoundedReg = InProlog ? (unsigned)X86::RDX + RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), - LimitReg = InProlog ? (unsigned)X86::RCX + LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), - JoinReg = InProlog ? (unsigned)X86::RCX + JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), - ProbeReg = InProlog ? (unsigned)X86::RCX + ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass); // SP-relative offsets where we can save RCX and RDX. @@ -654,9 +653,10 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) .addReg(CopyReg) .addReg(SizeReg); - BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) + BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg) .addReg(TestReg) - .addReg(ZeroReg); + .addReg(ZeroReg) + .addImm(X86::COND_B); // FinalReg now holds final stack pointer value, or zero if // allocation would overflow. Compare against the current stack @@ -673,7 +673,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, .addReg(X86::GS); BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); // Jump if the desired stack pointer is at or above the stack limit. - BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE); // Add code to roundMBB to round the final stack pointer to a page boundary. 
RoundMBB->addLiveIn(FinalReg); @@ -710,7 +710,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); - BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE); MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); @@ -794,8 +794,8 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addExternalSymbol(MF.createExternalSymbolName(Symbol)); } - unsigned AX = Is64Bit ? X86::RAX : X86::EAX; - unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX; + unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP; CI.addReg(AX, RegState::Implicit) .addReg(SP, RegState::Implicit) .addReg(AX, RegState::Define | RegState::Implicit) @@ -809,7 +809,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // adjusting %rsp. // All other platforms do not specify a particular ABI for the stack probe // function, so we arbitrarily define it to not adjust %esp/%rsp itself. - BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP) + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP) .addReg(SP) .addReg(AX); } @@ -872,6 +872,17 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MI->getOperand(3).setIsDead(); } +bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { + // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be + // clobbered by any interrupt handler. + assert(&STI == &MF.getSubtarget<X86Subtarget>() && + "MF used frame lowering for wrong subtarget"); + const Function &Fn = MF.getFunction(); + const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv()); + return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone); +} + + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to @@ -976,7 +987,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR; bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); - bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry(); // FIXME: Emit FPO data for EH funclets. @@ -1030,12 +1040,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) && + if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !UseStackProbe && // No stack probes. - !IsWin64CC && // Win64 has no Red Zone !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. 
!MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); @@ -1774,6 +1783,15 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int64_t FPDelta = 0; + // In an x86 interrupt, remove the offset we added to account for the return + // address from any stack object allocated in the caller's frame. Interrupts + // do not have a standard return address. Fixed objects in the current frame, + // such as SSE register spills, should not get this treatment. + if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR && + Offset >= 0) { + Offset += getOffsetOfLocalArea(); + } + if (IsWin64Prologue) { assert(!MFI.hasCalls() || (StackSize % 16) == 8); @@ -1888,8 +1906,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // If !hasReservedCallFrame the function might have SP adjustement in the // body. So, even though the offset is statically known, it depends on where // we are in the function. - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF)) + if (!IgnoreSPUpdates && !hasReservedCallFrame(MF)) return getFrameIndexReference(MF, FI, FrameReg); // We don't handle tail calls, and shouldn't be seeing them either. @@ -2407,7 +2424,7 @@ void X86FrameLowering::adjustForSegmentedStacks( // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. @@ -2637,7 +2654,7 @@ void X86FrameLowering::adjustForHiPEPrologue( // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); + BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). @@ -2646,7 +2663,7 @@ void X86FrameLowering::adjustForHiPEPrologue( SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); + BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE); stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); @@ -2802,7 +2819,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); if (StackAdjustment) { - if (!(F.optForMinSize() && + if (!(F.hasMinSize() && adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment))) BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment, /*InEpilogue=*/false); @@ -3079,8 +3096,7 @@ void X86FrameLowering::orderFrameObjects( // Sort the objects using X86FrameSortingAlgorithm (see its comment for // info). 
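The has128ByteRedZone hunks earlier in this file's diff centralize the red-zone test that emitPrologue previously open-coded. A rough standalone restatement of those conditions, with stand-in booleans for the real MachineFunction queries (a sketch, not the in-tree logic verbatim):

    #include <cstdint>

    // When can the x86-64 prologue skip "sub rsp, N" and keep locals in the
    // 128-byte red zone below RSP?
    bool canUseRedZone(bool Is64Bit, bool IsWin64CC, bool HasNoRedZoneAttr,
                       bool NeedsRealign, bool HasVarSizedObjects,
                       bool AdjustsStack, bool UsesStackProbes,
                       uint64_t FrameBytes) {
      // The red zone exists only in the 64-bit SysV ABI; Win64 and functions
      // marked noredzone (e.g. kernel or interrupt code) must not rely on it.
      bool HasRedZone = Is64Bit && !IsWin64CC && !HasNoRedZoneAttr;
      // Leaf-like frames that fit in 128 bytes need no stack-pointer adjustment.
      return HasRedZone && !NeedsRealign && !HasVarSizedObjects &&
             !AdjustsStack && !UsesStackProbes && FrameBytes <= 128;
    }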
-  std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
-                   X86FrameSortingComparator());
+  llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
 
   // Now modify the original list to represent the final order that
   // we want. The order will depend on whether we're going to access them
@@ -3154,7 +3170,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
   MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
   int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
   int UnwindHelpFI =
-      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
 
   // Store -2 into UnwindHelp on function entry. We have to scan forwards past
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
index 3bd805aae123..d32746e3a36e 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -1,9 +1,8 @@
 //===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -172,6 +171,10 @@ public:
 
   unsigned getInitialCFARegister(const MachineFunction &MF) const override;
 
+  /// Return true if the function has a redzone (accessible bytes past the
+  /// frame of the top of stack function) as part of its ABI.
+  bool has128ByteRedZone(const MachineFunction& MF) const;
+
 private:
   uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
 
diff --git a/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def b/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
index 9cd3f96f83ac..0fdea9071c29 100644
--- a/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
+++ b/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -1,9 +1,8 @@
 //===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index fe75dbd8eff4..95d31e62cafc 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -74,6 +73,7 @@ namespace { int JT; unsigned Align; // CP alignment. unsigned char SymbolFlags; // X86II::MO_* + bool NegateIndex = false; X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), @@ -116,6 +116,8 @@ namespace { dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; dbgs() << " Scale " << Scale << '\n' << "IndexReg "; + if (NegateIndex) + dbgs() << "negate "; if (IndexReg.getNode()) IndexReg.getNode()->dump(DAG); else @@ -170,8 +172,8 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false), - OptForMinSize(false) {} + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false), + OptForMinSize(false), IndirectTlsSegRefs(false) {} StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -182,6 +184,13 @@ namespace { Subtarget = &MF.getSubtarget<X86Subtarget>(); IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( "indirect-tls-seg-refs"); + + // OptFor[Min]Size are used in pattern predicates that isel is matching. + OptForSize = MF.getFunction().hasOptSize(); + OptForMinSize = MF.getFunction().hasMinSize(); + assert((!OptForMinSize || OptForSize) && + "OptForMinSize implies OptForSize"); + SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -204,7 +213,7 @@ namespace { bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); - bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); @@ -252,16 +261,32 @@ namespace { void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, - SDValue &Base, SDValue &Scale, + MVT VT, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) - ? CurDAG->getTargetFrameIndex( - AM.Base_FrameIndex, - TLI->getPointerTy(CurDAG->getDataLayout())) - : AM.Base_Reg; + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + Base = CurDAG->getTargetFrameIndex( + AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); + else if (AM.Base_Reg.getNode()) + Base = AM.Base_Reg; + else + Base = CurDAG->getRegister(0, VT); + Scale = getI8Imm(AM.Scale, DL); - Index = AM.IndexReg; + + // Negate the index if needed. + if (AM.NegateIndex) { + unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; + SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, + AM.IndexReg), 0); + AM.IndexReg = Neg; + } + + if (AM.IndexReg.getNode()) + Index = AM.IndexReg; + else + Index = CurDAG->getRegister(0, VT); + // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. 
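The new NegateIndex flag in X86ISelAddressMode, together with the NEG32r/NEG64r emission in getAddressOperands above, defers materializing a negated index register until an addressing mode is actually committed. The source-level shape it targets looks roughly like this (illustrative only):

    // base - i*4 can still use the [reg + reg*scale] addressing form if the
    // index register is negated first. Recording "negate later" avoids
    // creating an ISD::SUB node up front that would be left dangling when
    // the LEA or memory form turns out to be unprofitable.
    int load_negative_index(const int *base, long i) {
      return base[-i]; // address = base + (-i) * sizeof(int)
    }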
if (AM.GV) @@ -290,7 +315,7 @@ namespace { if (AM.Segment.getNode()) Segment = AM.Segment; else - Segment = CurDAG->getRegister(0, MVT::i32); + Segment = CurDAG->getRegister(0, MVT::i16); } // Utility function to determine whether we should avoid selecting @@ -400,6 +425,19 @@ namespace { return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } + // Helper to detect unneeded and instructions on shift amounts. Called + // from PatFrags in tablegen. + bool isUnneededShiftMask(SDNode *N, unsigned Width) const { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); + const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); + + if (Val.countTrailingOnes() >= Width) + return true; + + APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; + return Mask.countTrailingOnes() >= Width; + } + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. @@ -464,6 +502,8 @@ namespace { bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); + bool tryShrinkShlLogicImm(SDNode *N); + bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); @@ -485,7 +525,7 @@ namespace { static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC || - Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) { + Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. @@ -497,7 +537,7 @@ static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { } // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || - Opcode == X86ISD::FSETCCM_RND) + Opcode == X86ISD::FSETCCM_SAE) return true; return false; @@ -571,6 +611,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { Imm->getAPIntValue().getBitWidth() == 64 && Imm->getAPIntValue().isIntN(32)) return false; + + // If this really a zext_inreg that can be represented with a movzx + // instruction, prefer that. + // TODO: We could shrink the load and fold if it is non-volatile. + if (U->getOpcode() == ISD::AND && + (Imm->getAPIntValue() == UINT8_MAX || + Imm->getAPIntValue() == UINT16_MAX || + Imm->getAPIntValue() == UINT32_MAX)) + return false; + + // ADD/SUB with can negate the immediate and use the opposite operation + // to fit 128 into a sign extended 8 bit immediate. + if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && + (-Imm->getAPIntValue()).isSignedIntN(8)) + return false; } // If the other operand is a TLS address, we should fold it instead. @@ -720,11 +775,6 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptFor[Min]Size are used in pattern predicates that isel is matching. 
- OptForSize = MF->getFunction().optForSize(); - OptForMinSize = MF->getFunction().optForMinSize(); - assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. @@ -741,6 +791,143 @@ void X86DAGToDAGISel::PreprocessISelDAG() { continue; } + switch (N->getOpcode()) { + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + // Replace vector fp_to_s/uint with their X86 specific equivalent so we + // don't need 2 sets of patterns. + if (!N->getSimpleValueType(0).isVector()) + break; + + unsigned NewOpc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; + case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; + } + SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: { + // Replace vector shifts with their X86 specific equivalent so we don't + // need 2 sets of patterns. + if (!N->getValueType(0).isVector()) + break; + + unsigned NewOpc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::SHL: NewOpc = X86ISD::VSHLV; break; + case ISD::SRA: NewOpc = X86ISD::VSRAV; break; + case ISD::SRL: NewOpc = X86ISD::VSRLV; break; + } + SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: { + // Replace vector any extend with the zero extend equivalents so we don't + // need 2 sets of patterns. Ignore vXi1 extensions. + if (!N->getValueType(0).isVector() || + N->getOperand(0).getScalarValueSizeInBits() == 1) + break; + + unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND + ? ISD::ZERO_EXTEND + : ISD::ZERO_EXTEND_VECTOR_INREG; + + SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FRINT: { + // Replace fp rounding with their X86 specific equivalent so we don't + // need 2 sets of patterns. + unsigned Imm; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::FCEIL: Imm = 0xA; break; + case ISD::FFLOOR: Imm = 0x9; break; + case ISD::FTRUNC: Imm = 0xB; break; + case ISD::FNEARBYINT: Imm = 0xC; break; + case ISD::FRINT: Imm = 0x4; break; + } + SDLoc dl(N); + SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, + N->getValueType(0), + N->getOperand(0), + CurDAG->getConstant(Imm, dl, MVT::i8)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + case X86ISD::FANDN: + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: { + // Widen scalar fp logic ops to vector to reduce isel patterns. + // FIXME: Can we do this during lowering/combine. + MVT VT = N->getSimpleValueType(0); + if (VT.isVector() || VT == MVT::f128) + break; + + MVT VecVT = VT == MVT::f64 ? 
MVT::v2f64 : MVT::v4f32; + SDLoc dl(N); + SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, + N->getOperand(0)); + SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, + N->getOperand(1)); + + SDValue Res; + if (Subtarget->hasSSE2()) { + EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); + Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); + Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); + unsigned Opc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; + case X86ISD::FAND: Opc = ISD::AND; break; + case X86ISD::FOR: Opc = ISD::OR; break; + case X86ISD::FXOR: Opc = ISD::XOR; break; + } + Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); + Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); + } else { + Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); + } + Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, + CurDAG->getIntPtrConstant(0, dl)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + } + if (OptLevel != CodeGenOpt::None && // Only do this when the target can fold the load into the call or // jmp. @@ -786,65 +973,135 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // and the node legalization. As such this pass basically does "really // late" legalization of these inline with the X86 isel pass. // FIXME: This should only happen when not compiled with -O0. - if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) - continue; + switch (N->getOpcode()) { + default: continue; + case ISD::FP_ROUND: + case ISD::FP_EXTEND: + { + MVT SrcVT = N->getOperand(0).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; - MVT SrcVT = N->getOperand(0).getSimpleValueType(); - MVT DstVT = N->getSimpleValueType(0); + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(TLI); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; - // If any of the sources are vectors, no fp stack involved. - if (SrcVT.isVector() || DstVT.isVector()) - continue; + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(1)) + continue; + } - // If the source and destination are SSE registers, then this is a legal - // conversion that should not be lowered. - const X86TargetLowering *X86Lowering = - static_cast<const X86TargetLowering *>(TLI); - bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); - bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); - if (SrcIsSSE && DstIsSSE) - continue; + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? 
SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + SDLoc dl(N); - if (!SrcIsSSE && !DstIsSSE) { - // If this is an FPStack extension, it is a noop. - if (N->getOpcode() == ISD::FP_EXTEND) + // FIXME: optimize the case where the src/dest is a load or store? + + SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0), + MemTmp, MachinePointerInfo(), MemVT); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), MemVT); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + break; + } + + //The sequence of events for lowering STRICT_FP versions of these nodes requires + //dealing with the chain differently, as there is already a preexisting chain. + case ISD::STRICT_FP_ROUND: + case ISD::STRICT_FP_EXTEND: + { + MVT SrcVT = N->getOperand(1).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) continue; - // If this is a value-preserving FPStack truncation, it is a noop. - if (N->getConstantOperandVal(1)) + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(TLI); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) continue; - } - // Here we could have an FP stack truncation or an FPStack <-> SSE convert. - // FPStack has extload and truncstore. SSE can fold direct loads into other - // operations. Based on this, decide what we want to do. - MVT MemVT; - if (N->getOpcode() == ISD::FP_ROUND) - MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. - else - MemVT = SrcIsSSE ? SrcVT : DstVT; + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::STRICT_FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(2)) + continue; + } - SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); - SDLoc dl(N); + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT MemVT; + if (N->getOpcode() == ISD::STRICT_FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; - // FIXME: optimize the case where the src/dest is a load or store? - SDValue Store = - CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0), - MemTmp, MachinePointerInfo(), MemVT); - SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, - MachinePointerInfo(), MemVT); + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + SDLoc dl(N); + + // FIXME: optimize the case where the src/dest is a load or store? + + //Since the operation is StrictFP, use the preexisting chain. 
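The FCEIL/FFLOOR/FTRUNC/FNEARBYINT/FRINT preprocessing earlier in PreprocessISelDAG maps each node to a VRNDSCALE immediate. As a reading aid, those values follow the SSE4.1/AVX-512 rounding imm8 layout (bits [1:0] select the rounding mode, bit 2 selects MXCSR.RC instead of the immediate, bit 3 suppresses the inexact exception); the enum below is purely illustrative and not part of the patch:

    enum X86RoundImm : unsigned {
      RoundCeil      = 0xA, // 1010: suppress inexact, round toward +infinity
      RoundFloor     = 0x9, // 1001: suppress inexact, round toward -infinity
      RoundTrunc     = 0xB, // 1011: suppress inexact, round toward zero
      RoundNearbyInt = 0xC, // 1100: suppress inexact, use current MXCSR mode
      RoundRint      = 0x4  // 0100: use current MXCSR mode, may raise inexact
    };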
+ SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1), + MemTmp, MachinePointerInfo(), MemVT); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), MemVT); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesWith(N, Result.getNode()); + break; + } + } - // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the - // extload we created. This will cause general havok on the dag because - // anything below the conversion could be folded into other existing nodes. - // To avoid invalidating 'I', back it up to the convert node. - --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); // Now that we did that, the node is dead. Increment the iterator to the // next node to process, then delete N. ++I; CurDAG->DeleteNode(N); } + + // The load+call transform above can leave some dead nodes in the graph. Make + // sure we remove them. Its possible some of the other transforms do to so + // just remove dead nodes unconditionally. + CurDAG->RemoveDeadNodes(); } // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. @@ -1220,20 +1477,25 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, // because it has a smaller encoding. // TODO: Which other code models can use this? - if (TM.getCodeModel() == CodeModel::Small && - Subtarget->is64Bit() && - AM.Scale == 1 && - AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == nullptr && - AM.IndexReg.getNode() == nullptr && - AM.SymbolFlags == X86II::MO_NO_FLAG && - AM.hasSymbolicDisplacement()) - AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + switch (TM.getCodeModel()) { + default: break; + case CodeModel::Small: + case CodeModel::Kernel: + if (Subtarget->is64Bit() && + AM.Scale == 1 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr && + AM.SymbolFlags == X86II::MO_NO_FLAG && + AM.hasSymbolicDisplacement()) + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + break; + } return false; } -bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, +bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth) { // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. @@ -1325,6 +1587,7 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, ShlCount); insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); + DAG.RemoveDeadNode(N.getNode()); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); return false; @@ -1334,13 +1597,31 @@ static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, - uint64_t Mask, - SDValue Shift, SDValue X, X86ISelAddressMode &AM) { + SDValue Shift = N.getOperand(0); + + // Use a signed mask so that shifting right will insert sign bits. These + // bits will be removed when we shift the result left so it doesn't matter + // what we use. 
This might allow a smaller immediate encoding. + int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue(); + + // If we have an any_extend feeding the AND, look through it to see if there + // is a shift behind it. But only if the AND doesn't use the extended bits. + // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? + bool FoundAnyExtend = false; + if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && + Shift.getOperand(0).getSimpleValueType() == MVT::i32 && + isUInt<32>(Mask)) { + FoundAnyExtend = true; + Shift = Shift.getOperand(0); + } + if (Shift.getOpcode() != ISD::SHL || !isa<ConstantSDNode>(Shift.getOperand(1))) return true; + SDValue X = Shift.getOperand(0); + // Not likely to be profitable if either the AND or SHIFT node has more // than one use (unless all uses are for address computation). Besides, // isel mechanism requires their node ids to be reused. @@ -1354,6 +1635,12 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, MVT VT = N.getSimpleValueType(); SDLoc DL(N); + if (FoundAnyExtend) { + SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); + insertDAGNode(DAG, N, NewX); + X = NewX; + } + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); @@ -1367,6 +1654,7 @@ static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); + DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << ShiftAmt; AM.IndexReg = NewAnd; @@ -1477,6 +1765,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); + DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewSRL; @@ -1535,6 +1824,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); + DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewAnd; @@ -1642,14 +1932,15 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + assert(N.getSimpleValueType().getSizeInBits() <= 64 && + "Unexpected value size!"); + SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; SDValue X = And.getOperand(0); - // We only handle up to 64-bit values here as those are what matter for - // addressing mode optimizations. - if (X.getSimpleValueType().getSizeInBits() > 64) break; - // The mask used for the transform is expected to be post-shift, but we // found the shift first so just apply the shift to the mask before passing // it down. @@ -1720,9 +2011,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { + N = Handle.getValue(); AM = Backup; break; } + N = Handle.getValue(); // Test if the index field is free for use. 
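foldMaskedShiftToScaledMask, updated above to take the AND node directly and to look through a one-use any_extend, rewrites (x << C) & M as ((x & (M >> C)) << C) so the shift can be absorbed by the addressing-mode scale. A small source-level illustration of the equivalence, assuming nothing beyond standard C++ (the generated code is not shown here):

    #include <cstdint>

    // (x << 1) & 0x1fe keeps bits [8:1]; the same value is (x & 0xff) << 1,
    // where the << 1 can fold into a scale-by-2 address.
    uint32_t index_before(uint32_t x) { return (x << 1) & 0x1fe; }
    uint32_t index_after(uint32_t x)  { return (x & 0xff) << 1; }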
if (AM.IndexReg.getNode() || AM.isRIPRelative()) { AM = Backup; @@ -1730,7 +2023,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } int Cost = 0; - SDValue RHS = Handle.getValue().getOperand(1); + SDValue RHS = N.getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. @@ -1743,9 +2036,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. - // FIXME: Don't rely on DELETED_NODEs. if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && - AM.Base_Reg->getOpcode() != ISD::DELETED_NODE && !AM.Base_Reg.getNode()->hasOneUse()) || AM.BaseType == X86ISelAddressMode::FrameIndexBase) --Cost; @@ -1762,14 +2053,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } // Ok, the transformation is legal and appears profitable. Go for it. - SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType()); - SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); - AM.IndexReg = Neg; + // Negation will be emitted later to avoid creating dangling nodes if this + // was an unprofitable LEA. + AM.IndexReg = RHS; + AM.NegateIndex = true; AM.Scale = 1; - - // Insert the new nodes into the topological ordering. - insertDAGNode(*CurDAG, Handle.getValue(), Zero); - insertDAGNode(*CurDAG, Handle.getValue(), Neg); return false; } @@ -1797,37 +2085,77 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; - SDValue Shift = N.getOperand(0); - if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; - SDValue X = Shift.getOperand(0); - // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. - if (X.getSimpleValueType().getSizeInBits() > 64) break; + assert(N.getSimpleValueType().getSizeInBits() <= 64 && + "Unexpected value size!"); if (!isa<ConstantSDNode>(N.getOperand(1))) break; - uint64_t Mask = N.getConstantOperandVal(1); - // Try to fold the mask and shift into an extract and scale. - if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) - return false; + if (N.getOperand(0).getOpcode() == ISD::SRL) { + SDValue Shift = N.getOperand(0); + SDValue X = Shift.getOperand(0); - // Try to fold the mask and shift directly into the scale. - if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) - return false; + uint64_t Mask = N.getConstantOperandVal(1); + + // Try to fold the mask and shift into an extract and scale. + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift directly into the scale. + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift into BEXTR and scale. + if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) + return false; + } // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. - if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) - return false; - - // Try to fold the mask and shift into BEXTR and scale. 
- if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) + if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) return false; break; } + case ISD::ZERO_EXTEND: { + // Try to widen a zexted shift left to the same size as its use, so we can + // match the shift as a scale factor. + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) + break; + if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) + break; + + // Give up if the shift is not a valid scale factor [1,2,3]. + SDValue Shl = N.getOperand(0); + auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); + if (!ShAmtC || ShAmtC->getZExtValue() > 3) + break; + + // The narrow shift must only shift out zero bits (it must be 'nuw'). + // That makes it safe to widen to the destination type. + APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), + ShAmtC->getZExtValue()); + if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) + break; + + // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); + SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); + + // Convert the shift to scale factor. + AM.Scale = 1 << ShAmtC->getZExtValue(); + AM.IndexReg = Zext; + + insertDAGNode(*CurDAG, N, Zext); + insertDAGNode(*CurDAG, N, NewShl); + CurDAG->ReplaceAllUsesWith(N, NewShl); + CurDAG->RemoveDeadNode(N.getNode()); + return false; + } } return matchAddressBase(N, AM); @@ -1893,17 +2221,14 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, if (AddrSpace == 258) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + // Try to match into the base and displacement fields. if (matchVectorAddress(N, AM)) return false; - MVT VT = N.getSimpleValueType(); - if (AM.BaseType == X86ISelAddressMode::RegBase) { - if (!AM.Base_Reg.getNode()) - AM.Base_Reg = CurDAG->getRegister(0, VT); - } - - getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } @@ -1925,6 +2250,8 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores Parent->getOpcode() != X86ISD::TLSCALL && // Fixme + Parent->getOpcode() != X86ISD::ENQCMD && // Fixme + Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = @@ -1938,19 +2265,14 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); } - if (matchAddress(N, AM)) - return false; - + // Save the DL and VT before calling matchAddress, it can invalidate N. 
+ SDLoc DL(N); MVT VT = N.getSimpleValueType(); - if (AM.BaseType == X86ISelAddressMode::RegBase) { - if (!AM.Base_Reg.getNode()) - AM.Base_Reg = CurDAG->getRegister(0, VT); - } - if (!AM.IndexReg.getNode()) - AM.IndexReg = CurDAG->getRegister(0, VT); + if (matchAddress(N, AM)) + return false; - getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } @@ -1982,12 +2304,14 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, if (!hasSingleUsesFromRoot(Root, Parent)) return false; - // We can allow a full vector load here since narrowing a load is ok. + // We can allow a full vector load here since narrowing a load is ok unless + // it's volatile. if (ISD::isNON_EXTLoad(N.getNode())) { - PatternNodeWithChain = N; - if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { - LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + LoadSDNode *LD = cast<LoadSDNode>(N); + if (!LD->isVolatile() && + IsProfitableToFold(N, LD, Root) && + IsLegalToFold(N, Parent, Root, OptLevel)) { + PatternNodeWithChain = N; return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); } @@ -2018,23 +2342,6 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, } } - // Also handle the case where we explicitly require zeros in the top - // elements. This is a vector shuffle from the zero vector. - if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && - // Check to see if the top elements are all zeros (or bitcast of zeros). - N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && - N.getOperand(0).getNode()->hasOneUse()) { - PatternNodeWithChain = N.getOperand(0).getOperand(0); - if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && - IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { - // Okay, this is a zero extending load. Fold it. - LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, - Segment); - } - } - return false; } @@ -2085,14 +2392,12 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); if (RN && RN->getReg() == 0) Base = CurDAG->getRegister(0, MVT::i64); - else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) { + else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { // Base could already be %rip, particularly in the x32 ABI. 
- Base = SDValue(CurDAG->getMachineNode( - TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, - CurDAG->getTargetConstant(0, DL, MVT::i64), - Base, - CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)), - 0); + SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, + MVT::i64), 0); + Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, + Base); } RN = dyn_cast<RegisterSDNode>(Index); @@ -2101,13 +2406,10 @@ bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, else { assert(Index.getValueType() == MVT::i32 && "Expect to be extending 32-bit registers for use in LEA"); - Index = SDValue(CurDAG->getMachineNode( - TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, - CurDAG->getTargetConstant(0, DL, MVT::i64), - Index, - CurDAG->getTargetConstant(X86::sub_32bit, DL, - MVT::i32)), - 0); + SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, + MVT::i64), 0); + Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, + Index); } return true; @@ -2136,18 +2438,13 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, AM.Segment = Copy; unsigned Complexity = 0; - if (AM.BaseType == X86ISelAddressMode::RegBase) - if (AM.Base_Reg.getNode()) - Complexity = 1; - else - AM.Base_Reg = CurDAG->getRegister(0, VT); + if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) + Complexity = 1; else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) Complexity = 4; if (AM.IndexReg.getNode()) Complexity++; - else - AM.IndexReg = CurDAG->getRegister(0, VT); // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with // a simple shift. @@ -2167,14 +2464,14 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, Complexity += 2; } - if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode())) + if (AM.Disp) Complexity++; // If it isn't worth using an LEA, reject it. 
if (Complexity <= 2) return false; - getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } @@ -2188,17 +2485,15 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, X86ISelAddressMode AM; AM.GV = GA->getGlobal(); AM.Disp += GA->getOffset(); - AM.Base_Reg = CurDAG->getRegister(0, N.getValueType()); AM.SymbolFlags = GA->getTargetFlags(); - if (N.getValueType() == MVT::i32) { + MVT VT = N.getSimpleValueType(); + if (VT == MVT::i32) { AM.Scale = 1; AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); - } else { - AM.IndexReg = CurDAG->getRegister(0, MVT::i64); } - getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); return true; } @@ -2282,14 +2577,22 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { CR->getSignedMax().slt(1ull << Width); } -static X86::CondCode getCondFromOpc(unsigned Opc) { +static X86::CondCode getCondFromNode(SDNode *N) { + assert(N->isMachineOpcode() && "Unexpected node"); X86::CondCode CC = X86::COND_INVALID; - if (CC == X86::COND_INVALID) - CC = X86::getCondFromBranchOpc(Opc); - if (CC == X86::COND_INVALID) - CC = X86::getCondFromSETOpc(Opc); - if (CC == X86::COND_INVALID) - CC = X86::getCondFromCMovOpc(Opc); + unsigned Opc = N->getMachineOpcode(); + if (Opc == X86::JCC_1) + CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1)); + else if (Opc == X86::SETCCr) + CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0)); + else if (Opc == X86::SETCCm) + CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5)); + else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || + Opc == X86::CMOV64rr) + CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2)); + else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || + Opc == X86::CMOV64rm) + CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6)); return CC; } @@ -2315,7 +2618,7 @@ bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. - X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); + X86::CondCode CC = getCondFromNode(*FlagUI); switch (CC) { // Comparisons which only use the zero flag. @@ -2351,7 +2654,7 @@ bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. - X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); + X86::CondCode CC = getCondFromNode(*FlagUI); switch (CC) { // Comparisons which don't examine the SF flag. @@ -2412,7 +2715,7 @@ static bool mayUseCarryFlag(X86::CondCode CC) { if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. 
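The selectLEAAddr changes just above stop pre-filling register 0 for the base and index and only score components that are really present. The profitability count can be restated roughly as follows (a sketch of the heuristic, not the exact in-tree code):

    // An LEA is only worth it when it folds enough address components to beat
    // a plain ADD/SHL sequence; the caller rejects it when the score is <= 2.
    int leaComplexity(bool HasFrameIndexBase, bool HasBaseReg, bool HasIndexReg,
                      bool HasSymbolicDisp, bool HasDisp) {
      int Score = 0;
      if (HasFrameIndexBase)
        Score += 4;
      else if (HasBaseReg)
        Score += 1;
      if (HasIndexReg)
        Score += 1;
      if (HasSymbolicDisp) // global, constant pool, jump table, ...
        Score += 2;
      if (HasDisp)
        Score += 1;
      return Score;
    }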
- X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); + X86::CondCode CC = getCondFromNode(*FlagUI); if (mayUseCarryFlag(CC)) return false; @@ -2590,10 +2893,13 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { return false; bool IsCommutable = false; + bool IsNegate = false; switch (Opc) { default: return false; case X86ISD::SUB: + IsNegate = isNullConstant(StoredVal.getOperand(0)); + break; case X86ISD::SBB: break; case X86ISD::ADD: @@ -2605,7 +2911,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { break; } - unsigned LoadOpNo = 0; + unsigned LoadOpNo = IsNegate ? 1 : 0; LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, @@ -2643,11 +2949,20 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { MachineSDNode *Result; switch (Opc) { - case X86ISD::ADD: case X86ISD::SUB: + // Handle negate. + if (IsNegate) { + unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, + X86::NEG8m); + const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, + MVT::Other, Ops); + break; + } + LLVM_FALLTHROUGH; + case X86ISD::ADD: // Try to match inc/dec. - if (!Subtarget->slowIncDec() || - CurDAG->getMachineFunction().getFunction().optForSize()) { + if (!Subtarget->slowIncDec() || OptForSize) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. @@ -2748,16 +3063,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { // See if the operand is a constant that we can fold into an immediate // operand. if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) { - auto OperandV = OperandC->getAPIntValue(); + int64_t OperandV = OperandC->getSExtValue(); // Check if we can shrink the operand enough to fit in an immediate (or // fit into a smaller immediate) by negating it and switching the // operation. if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && - ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 && - (-OperandV).getMinSignedBits() <= 8) || - (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && - (-OperandV).getMinSignedBits() <= 32)) && + ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || + (MemVT == MVT::i64 && !isInt<32>(OperandV) && + isInt<32>(-OperandV))) && hasNoCarryFlagUses(StoredVal.getValue(1))) { OperandV = -OperandV; Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; @@ -2765,11 +3079,10 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { // First try to fit this into an Imm8 operand. If it doesn't fit, then try // the larger immediate operand. - if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) { + if (MemVT != MVT::i8 && isInt<8>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImm8Opcode(Opc); - } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() && - (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) { + } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImmOpcode(Opc); } @@ -2829,8 +3142,6 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { if (NVT != MVT::i32 && NVT != MVT::i64) return false; - unsigned Size = NVT.getSizeInBits(); - SDValue NBits; // If we have BMI2's BZHI, we are ok with muti-use patterns. 
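matchBitExtract, whose hunks follow, recognizes several equivalent ways of writing "keep the low n bits of x" and turns them into BZHI (BMI2) or BEXTR (BMI). Spelled out at the source level, the four commented patterns a) through d) are roughly the following (valid for 0 < n < 64; illustrative only):

    #include <cstdint>

    uint64_t pat_a(uint64_t x, unsigned n) { return x & ((1ULL << n) - 1); }
    uint64_t pat_b(uint64_t x, unsigned n) { return x & ~(~0ULL << n); }
    uint64_t pat_c(uint64_t x, unsigned n) { return x & (~0ULL >> (64 - n)); }
    uint64_t pat_d(uint64_t x, unsigned n) { return (x << (64 - n)) >> (64 - n); }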
@@ -2843,16 +3154,27 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; + auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { + if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { + assert(V.getSimpleValueType() == MVT::i32 && + V.getOperand(0).getSimpleValueType() == MVT::i64 && + "Expected i64 -> i32 truncation"); + V = V.getOperand(0); + } + return V; + }; + // a) x & ((1 << nbits) + (-1)) - auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool { + auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, + &NBits](SDValue Mask) -> bool { // Match `add`. Must only have one use! if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) return false; // We should be adding all-ones constant (i.e. subtracting one.) if (!isAllOnesConstant(Mask->getOperand(1))) return false; - // Match `1 << nbits`. Must only have one use! - SDValue M0 = Mask->getOperand(0); + // Match `1 << nbits`. Might be truncated. Must only have one use! + SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isOneConstant(M0->getOperand(0))) @@ -2861,23 +3183,36 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { return true; }; + auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { + V = peekThroughOneUseTruncation(V); + return CurDAG->MaskedValueIsAllOnes( + V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), + NVT.getSizeInBits())); + }; + // b) x & ~(-1 << nbits) - auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool { + auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, + &NBits](SDValue Mask) -> bool { // Match `~()`. Must only have one use! - if (!isBitwiseNot(Mask) || !checkOneUse(Mask)) + if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) return false; - // Match `-1 << nbits`. Must only have one use! - SDValue M0 = Mask->getOperand(0); + // The -1 only has to be all-ones for the final Node's NVT. + if (!isAllOnes(Mask->getOperand(1))) + return false; + // Match `-1 << nbits`. Might be truncated. Must only have one use! + SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; - if (!isAllOnesConstant(M0->getOperand(0))) + // The -1 only has to be all-ones for the final Node's NVT. + if (!isAllOnes(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; // Match potentially-truncated (bitwidth - y) - auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) { + auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, + unsigned Bitwidth) { // Skip over a truncate of the shift amount. if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { ShiftAmt = ShiftAmt.getOperand(0); @@ -2889,52 +3224,56 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { if (ShiftAmt.getOpcode() != ISD::SUB) return false; auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0)); - if (!V0 || V0->getZExtValue() != Size) + if (!V0 || V0->getZExtValue() != Bitwidth) return false; NBits = ShiftAmt.getOperand(1); return true; }; // c) x & (-1 >> (32 - y)) - auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool { + auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, + matchShiftAmt](SDValue Mask) -> bool { + // The mask itself may be truncated. 
+ Mask = peekThroughOneUseTruncation(Mask); + unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); // Match `l>>`. Must only have one use! if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) return false; - // We should be shifting all-ones constant. + // We should be shifting truly all-ones constant. if (!isAllOnesConstant(Mask.getOperand(0))) return false; SDValue M1 = Mask.getOperand(1); // The shift amount should not be used externally. if (!checkOneUse(M1)) return false; - return matchShiftAmt(M1); + return matchShiftAmt(M1, Bitwidth); }; SDValue X; // d) x << (32 - y) >> (32 - y) - auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt, + auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt, &X](SDNode *Node) -> bool { if (Node->getOpcode() != ISD::SRL) return false; SDValue N0 = Node->getOperand(0); if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) return false; + unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); SDValue N1 = Node->getOperand(1); SDValue N01 = N0->getOperand(1); // Both of the shifts must be by the exact same value. // There should not be any uses of the shift amount outside of the pattern. if (N1 != N01 || !checkTwoUse(N1)) return false; - if (!matchShiftAmt(N1)) + if (!matchShiftAmt(N1, Bitwidth)) return false; X = N0->getOperand(0); return true; }; - auto matchLowBitMask = [&matchPatternA, &matchPatternB, - &matchPatternC](SDValue Mask) -> bool { - // FIXME: pattern c. + auto matchLowBitMask = [matchPatternA, matchPatternB, + matchPatternC](SDValue Mask) -> bool { return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); }; @@ -2954,42 +3293,46 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { SDLoc DL(Node); - // If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically* - // shifted (potentially with one-use trunc inbetween), - // and if so look past one-use truncation. - MVT XVT = NVT; - if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE && - X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) { - assert(NVT == MVT::i32 && "Expected target valuetype to be i32"); - X = X.getOperand(0); - XVT = X.getSimpleValueType(); - assert(XVT == MVT::i64 && "Expected truncation from i64"); - } + // Truncate the shift amount. + NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); + insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); - SDValue OrigNBits = NBits; - if (NBits.getValueType() != XVT) { - // Truncate the shift amount. - NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); - insertDAGNode(*CurDAG, OrigNBits, NBits); - - // Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit) - // register. All the other bits are undefined, we do not care about them. - SDValue ImplDef = - SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0); - insertDAGNode(*CurDAG, OrigNBits, ImplDef); - NBits = - CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits); - insertDAGNode(*CurDAG, OrigNBits, NBits); - } + // Insert 8-bit NBits into lowest 8 bits of 32-bit register. + // All the other bits are undefined, we do not care about them. + SDValue ImplDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); + insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); + NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef, + NBits); + insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); if (Subtarget->hasBMI2()) { // Great, just emit the the BZHI.. 
- SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits); + if (NVT != MVT::i32) { + // But have to place the bit count into the wide-enough register first. + NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits); + insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); + } + + SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits); ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } + // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is + // *logically* shifted (potentially with one-use trunc inbetween), + // and the truncation was the only use of the shift, + // and if so look past one-use truncation. + { + SDValue RealX = peekThroughOneUseTruncation(X); + // FIXME: only if the shift is one-use? + if (RealX != X && RealX.getOpcode() == ISD::SRL) + X = RealX; + } + + MVT XVT = X.getSimpleValueType(); + // Else, emitting BEXTR requires one more step. // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location @@ -2999,10 +3342,11 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // Shift NBits left by 8 bits, thus producing 'control'. // This makes the low 8 bits to be zero. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); - SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8); - insertDAGNode(*CurDAG, OrigNBits, Control); + SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); + insertDAGNode(*CurDAG, SDValue(Node, 0), Control); // If the 'X' is *logically* shifted, we can fold that shift into 'control'. + // FIXME: only if the shift is one-use? if (X.getOpcode() == ISD::SRL) { SDValue ShiftAmt = X.getOperand(1); X = X.getOperand(0); @@ -3011,13 +3355,20 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { "Expected shift amount to be i8"); // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! + // We could zext to i16 in some form, but we intentionally don't do that. SDValue OrigShiftAmt = ShiftAmt; - ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt); + ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); // And now 'or' these low 8 bits of shift amount into the 'control'. - Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt); - insertDAGNode(*CurDAG, OrigNBits, Control); + Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); + insertDAGNode(*CurDAG, SDValue(Node, 0), Control); + } + + // But have to place the 'control' into the wide-enough register first. + if (XVT != MVT::i32) { + Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); + insertDAGNode(*CurDAG, SDValue(Node, 0), Control); } // And finally, form the BEXTR itself. @@ -3025,7 +3376,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // The 'X' was originally truncated. Do that now. if (XVT != NVT) { - insertDAGNode(*CurDAG, OrigNBits, Extract); + insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); } @@ -3106,14 +3457,14 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; - SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. 
- ReplaceUses(Input.getValue(1), SDValue(NewNode, 1)); + ReplaceUses(Input.getValue(1), SDValue(NewNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()}); } else { - NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New); + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New); } return NewNode; @@ -3271,6 +3622,119 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { return true; } +bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { + MVT NVT = N->getSimpleValueType(0); + unsigned Opcode = N->getOpcode(); + SDLoc dl(N); + + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue Shift = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + if (!Cst) + return false; + + int64_t Val = Cst->getSExtValue(); + + // If we have an any_extend feeding the AND, look through it to see if there + // is a shift behind it. But only if the AND doesn't use the extended bits. + // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? + bool FoundAnyExtend = false; + if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && + Shift.getOperand(0).getSimpleValueType() == MVT::i32 && + isUInt<32>(Val)) { + FoundAnyExtend = true; + Shift = Shift.getOperand(0); + } + + if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) + return false; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + return false; + + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + if (!ShlCst) + return false; + + uint64_t ShAmt = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + return false; + + // Check the minimum bitwidth for the new constant. + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { + if (Opcode == ISD::AND) { + // AND32ri is the same as AND64ri32 with zext imm. + // Try this before sign extended immediates below. + ShiftedVal = (uint64_t)Val >> ShAmt; + if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) + return true; + // Also swap order when the AND can become MOVZX. + if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) + return true; + } + ShiftedVal = Val >> ShAmt; + if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) || + (!isInt<32>(Val) && isInt<32>(ShiftedVal))) + return true; + if (Opcode != ISD::AND) { + // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr + ShiftedVal = (uint64_t)Val >> ShAmt; + if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) + return true; + } + return false; + }; + + int64_t ShiftedVal; + if (!CanShrinkImmediate(ShiftedVal)) + return false; + + // Ok, we can reorder to get a smaller immediate. + + // But, its possible the original immediate allowed an AND to become MOVZX. + // Doing this late due to avoid the MakedValueIsZero call as late as + // possible. + if (Opcode == ISD::AND) { + // Find the smallest zext this could possibly be. + unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); + ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U)); + + // Figure out which bits need to be zero to achieve that mask. 
+ APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), + ZExtWidth); + NeededMask &= ~Cst->getAPIntValue(); + + if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) + return false; + } + + SDValue X = Shift.getOperand(0); + if (FoundAnyExtend) { + SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); + insertDAGNode(*CurDAG, SDValue(N, 0), NewX); + X = NewX; + } + + SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); + insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); + SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); + insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); + SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, + Shift.getOperand(1)); + ReplaceNode(N, NewSHL.getNode()); + SelectCode(NewSHL.getNode()); + return true; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3341,6 +3805,347 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { return true; } +static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, + bool FoldedBCast, bool Masked) { + if (Masked) { + if (FoldedLoad) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk; + } + } + + if (FoldedBCast) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk; + } + } + + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk; + case MVT::v16i16: + return IsTestN ? 
X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk; + } + } + + if (FoldedLoad) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm; + } + } + + if (FoldedBCast) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb; + } + } + + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr; + } +} + +// Try to create VPTESTM instruction. If InMask is not null, it will be used +// to form a masked operation. 
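The selection below targets the AVX-512 test instructions: VPTESTM sets mask bit i when (a[i] & b[i]) != 0, and VPTESTNM sets it when the AND is zero, so a setcc of an AND against an all-zeros vector can collapse into one instruction. A hedged intrinsic-level illustration, assuming the standard AVX-512F intrinsics (not code from this patch):

    #include <immintrin.h>

    // k[i] = ((a[i] & b[i]) != 0)  ->  VPTESTMD
    __mmask16 test_nonzero(__m512i a, __m512i b) {
      return _mm512_test_epi32_mask(a, b);
    }

    // k[i] = ((a[i] & b[i]) == 0)  ->  VPTESTNMD
    __mmask16 test_zero(__m512i a, __m512i b) {
      return _mm512_testn_epi32_mask(a, b);
    }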
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, + SDValue InMask) { + assert(Subtarget->hasAVX512() && "Expected AVX512!"); + assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected VT!"); + + // Look for equal and not equal compares. + ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get(); + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return false; + + // See if we're comparing against zero. This should have been canonicalized + // to RHS during lowering. + if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode())) + return false; + + SDValue N0 = Setcc.getOperand(0); + + MVT CmpVT = N0.getSimpleValueType(); + MVT CmpSVT = CmpVT.getVectorElementType(); + + // Start with both operands the same. We'll try to refine this. + SDValue Src0 = N0; + SDValue Src1 = N0; + + { + // Look through single use bitcasts. + SDValue N0Temp = N0; + if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) + N0Temp = N0.getOperand(0); + + // Look for single use AND. + if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { + Src0 = N0Temp.getOperand(0); + Src1 = N0Temp.getOperand(1); + } + } + + // Without VLX we need to widen the load. + bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); + + // We can only fold loads if the sources are unique. + bool CanFoldLoads = Src0 != Src1; + + // Try to fold loads unless we need to widen. + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load; + if (!Widen && CanFoldLoads) { + Load = Src1; + FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3, + Tmp4); + if (!FoldedLoad) { + // And is computative. + Load = Src0; + FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4); + if (FoldedLoad) + std::swap(Src0, Src1); + } + } + + auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { + // Look through single use bitcasts. + if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) + Src = Src.getOperand(0); + + if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { + Parent = Src.getNode(); + Src = Src.getOperand(0); + if (Src.getSimpleValueType() == CmpSVT) + return Src; + } + + return SDValue(); + }; + + // If we didn't fold a load, try to match broadcast. No widening limitation + // for this. But only 32 and 64 bit types are supported. + bool FoldedBCast = false; + if (!FoldedLoad && CanFoldLoads && + (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { + SDNode *ParentNode = nullptr; + if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { + FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); + } + + // Try the other operand. + if (!FoldedBCast) { + if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { + FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); + if (FoldedBCast) + std::swap(Src0, Src1); + } + } + } + + auto getMaskRC = [](MVT MaskVT) { + switch (MaskVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v2i1: return X86::VK2RegClassID; + case MVT::v4i1: return X86::VK4RegClassID; + case MVT::v8i1: return X86::VK8RegClassID; + case MVT::v16i1: return X86::VK16RegClassID; + case MVT::v32i1: return X86::VK32RegClassID; + case MVT::v64i1: return X86::VK64RegClassID; + } + }; + + bool IsMasked = InMask.getNode() != nullptr; + + SDLoc dl(Root); + + MVT ResVT = Setcc.getSimpleValueType(); + MVT MaskVT = ResVT; + if (Widen) { + // Widen the inputs using insert_subreg or copy_to_regclass. 
+ unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; + unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; + unsigned NumElts = CmpVT.getVectorNumElements() * Scale; + CmpVT = MVT::getVectorVT(CmpSVT, NumElts); + MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, + CmpVT), 0); + Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); + + assert(!FoldedLoad && "Shouldn't have folded the load"); + if (!FoldedBCast) + Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); + + if (IsMasked) { + // Widen the mask. + unsigned RegClass = getMaskRC(MaskVT); + SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); + InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, MaskVT, InMask, RC), 0); + } + } + + bool IsTestN = CC == ISD::SETEQ; + unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, + IsMasked); + + MachineSDNode *CNode; + if (FoldedLoad || FoldedBCast) { + SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); + + if (IsMasked) { + SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, + Load.getOperand(0) }; + CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + } else { + SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, + Load.getOperand(0) }; + CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + } + + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()}); + } else { + if (IsMasked) + CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); + else + CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); + } + + // If we widened, we need to shrink the mask VT. + if (Widen) { + unsigned RegClass = getMaskRC(ResVT); + SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); + CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, ResVT, SDValue(CNode, 0), RC); + } + + ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); + CurDAG->RemoveDeadNode(Root); + return true; +} + void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -3354,6 +4159,61 @@ void X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::INTRINSIC_VOID: { + unsigned IntNo = Node->getConstantOperandVal(1); + switch (IntNo) { + default: break; + case Intrinsic::x86_sse3_monitor: + case Intrinsic::x86_monitorx: + case Intrinsic::x86_clzero: { + bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; + + unsigned Opc = 0; + switch (IntNo) { + case Intrinsic::x86_sse3_monitor: + if (!Subtarget->hasSSE3()) + break; + Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; + break; + case Intrinsic::x86_monitorx: + if (!Subtarget->hasMWAITX()) + break; + Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; + break; + case Intrinsic::x86_clzero: + if (!Subtarget->hasCLZERO()) + break; + Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; + break; + } + + if (Opc) { + unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; + SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, + Node->getOperand(2), SDValue()); + SDValue InFlag = Chain.getValue(1); + + if (IntNo == Intrinsic::x86_sse3_monitor || + IntNo == Intrinsic::x86_monitorx) { + // Copy the other two operands to ECX and EDX. 
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), + InFlag); + InFlag = Chain.getValue(1); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), + InFlag); + InFlag = Chain.getValue(1); + } + + MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, + { Chain, InFlag}); + ReplaceNode(Node, CNode); + return; + } + } + } + + break; + } case ISD::BRIND: { if (Subtarget->isTargetNaCl()) // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We @@ -3389,13 +4249,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } break; - case X86ISD::BLENDV: { - // BLENDV selects like a regular VSELECT. - SDValue VSelect = CurDAG->getNode( - ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), + case ISD::VSELECT: { + // Replace VSELECT with non-mask conditions with with BLENDV. + if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1) + break; + + assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); + SDValue Blendv = CurDAG->getNode( + X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2)); - ReplaceNode(Node, VSelect.getNode()); - SelectCode(VSelect.getNode()); + ReplaceNode(Node, Blendv.getNode()); + SelectCode(Blendv.getNode()); // We already called ReplaceUses. return; } @@ -3411,6 +4275,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case ISD::AND: + if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { + // Try to form a masked VPTESTM. Operands can be in either order. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && + tryVPTESTM(Node, N0, N1)) + return; + if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && + tryVPTESTM(Node, N1, N0)) + return; + } + if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); @@ -3423,89 +4299,113 @@ void X86DAGToDAGISel::Select(SDNode *Node) { LLVM_FALLTHROUGH; case ISD::OR: - case ISD::XOR: { - - // For operations of the form (x << C1) op C2, check if we can use a smaller - // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. - SDValue N0 = Node->getOperand(0); - SDValue N1 = Node->getOperand(1); + case ISD::XOR: + if (tryShrinkShlLogicImm(Node)) + return; - if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + LLVM_FALLTHROUGH; + case ISD::ADD: + case ISD::SUB: { + // Try to avoid folding immediates with multiple uses for optsize. + // This code tries to select to register form directly to avoid going + // through the isel table which might fold the immediate. We can't change + // the patterns on the add/sub/and/or/xor with immediate paterns in the + // tablegen files to check immediate use count without making the patterns + // unavailable to the fast-isel table. + if (!OptForSize) break; - // i8 is unshrinkable, i16 should be promoted to i32. - if (NVT != MVT::i32 && NVT != MVT::i64) + // Only handle i8/i16/i32/i64. + if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) break; + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); - ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); - if (!Cst || !ShlCst) + if (!Cst) break; int64_t Val = Cst->getSExtValue(); - uint64_t ShlVal = ShlCst->getZExtValue(); - // Make sure that we don't change the operation by removing bits. 
- // This only matters for OR and XOR, AND is unaffected. - uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; - if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + // Make sure its an immediate that is considered foldable. + // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. + if (!isInt<8>(Val) && !isInt<32>(Val)) break; - unsigned ShlOp, AddOp, Op; - MVT CstVT = NVT; - - // Check the minimum bitwidth for the new constant. - // TODO: AND32ri is the same as AND64ri32 with zext imm. - // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr - // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. - if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) - CstVT = MVT::i8; - else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) - CstVT = MVT::i32; - - // Bail if there is no smaller encoding. - if (NVT == CstVT) + // Check if we should avoid folding this immediate. + if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) break; + // We should not fold the immediate. So we need a register form instead. + unsigned ROpc, MOpc; switch (NVT.SimpleTy) { - default: llvm_unreachable("Unsupported VT!"); + default: llvm_unreachable("Unexpected VT!"); + case MVT::i8: + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; + case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; + case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; + case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; + case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; + } + break; + case MVT::i16: + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; + case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; + case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; + case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; + case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; + } + break; case MVT::i32: - assert(CstVT == MVT::i8); - ShlOp = X86::SHL32ri; - AddOp = X86::ADD32rr; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case ISD::AND: Op = X86::AND32ri8; break; - case ISD::OR: Op = X86::OR32ri8; break; - case ISD::XOR: Op = X86::XOR32ri8; break; + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; + case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; + case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; + case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; + case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; } break; case MVT::i64: - assert(CstVT == MVT::i8 || CstVT == MVT::i32); - ShlOp = X86::SHL64ri; - AddOp = X86::ADD64rr; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; - case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; - case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + default: llvm_unreachable("Unexpected opcode!"); + case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; + case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; + case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; + case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; + case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; } break; } - // Emit the smaller op and the shift. 
- SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT); - SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); - if (ShlVal == 1) - CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0), - SDValue(New, 0)); - else - CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), - getI8Imm(ShlVal, dl)); + // Ok this is a AND/OR/XOR/ADD/SUB with constant. + + // If this is a not a subtract, we can still try to fold a load. + if (Opcode != ISD::SUB) { + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + // Update the chain. + ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); + // Record the mem-refs + CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()}); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + CurDAG->RemoveDeadNode(Node); + return; + } + } + + CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); return; } + case X86ISD::SMUL: // i16/i32/i64 are handled with isel patterns. if (NVT != MVT::i8) @@ -3903,7 +4803,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned TrailingZeros = countTrailingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); SDValue Shift = - SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, + SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); @@ -3914,7 +4814,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned LeadingZeros = countLeadingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); SDValue Shift = - SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, + SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); @@ -3972,8 +4872,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; } - // FIXME: We should be able to fold loads here. - SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); SDValue Reg = N0.getOperand(0); @@ -4066,10 +4964,46 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; } + case ISD::SETCC: { + if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) + return; + + break; + } + case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; break; + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FRINT: { + // Replace fp rounding with their X86 specific equivalent so we don't + // need 2 sets of patterns. + // FIXME: This can only happen when the nodes started as STRICT_* and have + // been mutated into their non-STRICT equivalents. Eventually this + // mutation will be removed and we should switch the STRICT_ nodes to a + // strict version of RNDSCALE in PreProcessISelDAG. 
+ unsigned Imm; + switch (Node->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::FCEIL: Imm = 0xA; break; + case ISD::FFLOOR: Imm = 0x9; break; + case ISD::FTRUNC: Imm = 0xB; break; + case ISD::FNEARBYINT: Imm = 0xC; break; + case ISD::FRINT: Imm = 0x4; break; + } + SDLoc dl(Node); + SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, + Node->getValueType(0), + Node->getOperand(0), + CurDAG->getConstant(Imm, dl, MVT::i8)); + ReplaceNode(Node, Res.getNode()); + SelectCode(Res.getNode()); + return; + } } SelectCode(Node); diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 2dfee3a4701e..0b4bf687e6cf 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1,9 +1,8 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -131,7 +130,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addBypassSlowDiv(64, 32); } - if (Subtarget.isTargetKnownWindowsMSVC() || + if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); @@ -159,6 +158,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setUseUnderscoreLongJmp(true); } + // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to + // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. + // FIXME: Should we be limitting the atomic size on other configs? Default is + // 1024. + if (!Subtarget.hasCmpxchg8b()) + setMaxAtomicSizeInBitsSupported(32); + // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); @@ -190,10 +196,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); - setOperationAction(ISD::ABS , MVT::i32 , Custom); - if (Subtarget.is64Bit()) - setOperationAction(ISD::ABS , MVT::i64 , Custom); + setOperationAction(ISD::ABS , MVT::i32 , Custom); } + setOperationAction(ISD::ABS , MVT::i64 , Custom); // Funnel shifts. 
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { @@ -258,14 +263,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); @@ -415,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + else + setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -486,6 +487,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } + if (!Subtarget.is64Bit()) + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); + if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } @@ -530,6 +534,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); + // Disable f32->f64 extload as we can only generate this in one instruction + // under optsize. So its easier to pattern match (fpext (load)) for that + // case instead of needing to emit 2 instructions for extload in the + // non-optsize case. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); @@ -668,6 +678,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); + setOperationAction(ISD::LROUND, MVT::f80, Expand); + setOperationAction(ISD::LLROUND, MVT::f80, Expand); + setOperationAction(ISD::LRINT, MVT::f80, Expand); + setOperationAction(ISD::LLRINT, MVT::f80, Expand); } // Always use a library call for pow. 
@@ -780,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + + setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -841,6 +858,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); + setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); + setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); + setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); if (!ExperimentalVectorWideningLegalization) { // Use widening instead of promotion. @@ -950,17 +971,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); - // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. - setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); - setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); @@ -1128,14 +1144,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); - // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1144,13 +1156,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, VT, Custom); } - if (ExperimentalVectorWideningLegalization) { - // These types need custom splitting if their input is a 128-bit vector. - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - } + // These types need custom splitting if their input is a 128-bit vector. 
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); @@ -1182,9 +1192,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); - // TODO - remove this once 256-bit X86ISD::ANDNP correctly split. - setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom); - // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); @@ -1260,7 +1267,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { - setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } @@ -1282,6 +1289,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); } if (HasInt256) @@ -1352,19 +1360,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } @@ -1378,9 +1381,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - for (MVT VT : MVT::fp_vector_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); @@ -1413,10 +1413,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE + // to 512-bit rather than use the AVX2 instructions so that we can use + // k-masks. 
if (!Subtarget.hasVLX()) { - // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE - // to 512-bit rather than use the AVX2 instructions so that we can use - // k-masks. for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); @@ -1446,6 +1446,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Custom); } // Without BWI we need to use custom lowering to handle MVT::v64i8 input. @@ -1465,13 +1467,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); - setOperationAction(ISD::SELECT, MVT::v8f64, Custom); - setOperationAction(ISD::SELECT, MVT::v8i64, Custom); - setOperationAction(ISD::SELECT, MVT::v16i32, Custom); - setOperationAction(ISD::SELECT, MVT::v32i16, Custom); - setOperationAction(ISD::SELECT, MVT::v64i8, Custom); - setOperationAction(ISD::SELECT, MVT::v16f32, Custom); - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -1485,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1705,6 +1701,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1788,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't @@ -1842,8 +1838,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. 
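In other words, the promotion set up here mirrors what the MSVC headers already do for these single-precision math functions; the shape of such an inline wrapper is roughly the following (a sketch, not the actual CRT source):

    #include <math.h>

    static inline float fmodf_like(float x, float y) {
      // Promote to double, call the f64 libcall, truncate the result back.
      return (float)fmod((double)x, (double)y);
    }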
- if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || - Subtarget.isTargetWindowsItanium())) + if (Subtarget.is32Bit() && + (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) @@ -1854,6 +1850,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); @@ -1881,6 +1878,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -2050,20 +2050,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. -EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, - unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { - if (Size >= 16 && - (!Subtarget.isUnalignedMem16Slow() || - ((DstAlign == 0 || DstAlign >= 16) && - (SrcAlign == 0 || SrcAlign >= 16)))) { +/// For vector ops we check that the overall size isn't larger than our +/// preferred vector width. +EVT X86TargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. - if (Size >= 32 && Subtarget.hasAVX()) { + if (Size >= 32 && Subtarget.hasAVX() && + (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, @@ -2071,11 +2070,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, // multiply) before we splat as a vector. return MVT::v32i8; } - if (Subtarget.hasSSE2()) + if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? // If we have SSE1 registers we should be able to use them. 
- if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87())) + if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && + (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { @@ -2104,11 +2104,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const { return true; } -bool -X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, - bool *Fast) const { +bool X86TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: @@ -2124,6 +2122,16 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // TODO: What about AVX-512 (512-bit) accesses? } } + // NonTemporal vector memory ops must be aligned. + if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { + // NT loads can only be vector aligned, so if its less aligned than the + // minimum vector size (which we can split the vector down to), we might as + // well use a regular unaligned vector load. + // We don't have any NT loads pre-SSE41. + if (!!(Flags & MachineMemOperand::MOLoad)) + return (Align < 16 || !Subtarget.hasSSE41()); + return false; + } // Misaligned accesses of any size are always allowed. return true; } @@ -2281,12 +2289,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. - auto *SecurityCheckCookie = cast<Function>( - M.getOrInsertFunction("__security_check_cookie", - Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext()))); - SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); - SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); + FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( + "__security_check_cookie", Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext())); + if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { + F->setCallingConv(CallingConv::X86_FastCall); + F->addAttribute(1, Attribute::AttrKind::InReg); + } return; } // glibc, bionic, and Fuchsia have a special slot for the stack guard. @@ -2304,7 +2313,7 @@ Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { return TargetLowering::getSDagStackGuard(M); } -Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { +Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. 
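The declaration inserted for it above (void return, a single pointer-sized argument, X86_FastCall calling convention with the argument marked InReg) corresponds to a prototype along these lines; this is a sketch and the exact CRT typedef for the cookie parameter may differ:

    // Cookie arrives in ECX because of the fastcall + inreg combination.
    void __fastcall __security_check_cookie(void *StackCookie);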
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { @@ -2347,8 +2356,6 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// -#include "X86GenCallingConv.inc" - bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { @@ -2703,7 +2710,6 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, "The values should reside in two registers"); SDValue Lo, Hi; - unsigned Reg; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); @@ -2713,7 +2719,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. - Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); @@ -2934,6 +2940,8 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: + // Swift: + case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); @@ -2986,22 +2994,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, else ValVT = VA.getValVT(); - // Calculate SP offset of interrupt parameter, re-arrange the slot normally - // taken by a return address. - int Offset = 0; - if (CallConv == CallingConv::X86_INTR) { - // X86 interrupts may take one or two arguments. - // On the stack there will be no return address as in regular call. - // Offset of last argument need to be set to -4/-8 bytes. - // Where offset of the first argument out of two, should be set to 0 bytes. - Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); - if (Subtarget.is64Bit() && Ins.size() == 2) { - // The stack pointer needs to be realigned for 64 bit handlers with error - // code, so the argument offset changes by 8 bytes. - Offset += 8; - } - } - // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -3014,15 +3006,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // can be improved with deeper analysis. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, /*isAliased=*/true); - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. - if (Flags.isCopyElisionCandidate()) { + // If the argument is passed directly in memory without any extension, then we + // can perform copy elision. Large vector types, for example, may be passed + // indirectly by pointer. + if (Flags.isCopyElisionCandidate() && + VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { @@ -3031,7 +3023,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // load from our portion of it. 
This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), - /*Immutable=*/false); + /*IsImmutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, @@ -3072,11 +3064,6 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, MFI.setObjectSExt(FI, true); } - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, @@ -3166,14 +3153,6 @@ SDValue X86TargetLowering::LowerFormalArguments( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); - if (CallConv == CallingConv::X86_INTR) { - bool isLegal = Ins.size() == 1 || - (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || - (!Is64Bit && Ins[1].VT == MVT::i32))); - if (!isLegal) - report_fatal_error("X86 interrupts may take one or two arguments"); - } - // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -3454,11 +3433,11 @@ SDValue X86TargetLowering::LowerFormalArguments( } // Copy all forwards from physical to virtual registers. - for (ForwardedRegister &F : Forwards) { + for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); - Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); } } @@ -3610,6 +3589,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); + MachineFunction::CallSiteInfo CSInfo; + if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); @@ -3805,6 +3786,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. @@ -3975,46 +3959,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. - } else if (Callee->getOpcode() == ISD::GlobalAddress) { - // If the callee is a GlobalAddress node (quite common, every direct call - // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack - // it. - GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); - - // We should use extra load for direct calls to dllimported functions in - // non-JIT mode. 
- const GlobalValue *GV = G->getGlobal(); - if (!GV->hasDLLImportStorageClass()) { - unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); - - Callee = DAG.getTargetGlobalAddress( - GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); - - if (OpFlags == X86II::MO_GOTPCREL) { - // Add a wrapper. - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - // Add extra indirection - Callee = DAG.getLoad( - getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - } - } - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - unsigned char OpFlags = - Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); - - Callee = DAG.getTargetExternalSymbol( - S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); - - if (OpFlags == X86II::MO_GOTPCREL) { - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, - getPointerTy(DAG.getDataLayout()), Callee); - Callee = DAG.getLoad( - getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - } + } else if (Callee->getOpcode() == ISD::GlobalAddress || + Callee->getOpcode() == ISD::ExternalSymbol) { + // Lower direct calls to global addresses and external symbols. Setting + // ForCall to true here has the effect of removing WrapperRIP when possible + // to allow direct calls to be selected without first materializing the + // address into a register. + Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI @@ -4105,7 +4056,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } if (HasNoCfCheck && IsCFProtectionSupported) { @@ -4114,6 +4067,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; @@ -4787,7 +4741,6 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!IntrData) return false; - Info.opc = ISD::INTRINSIC_W_CHAIN; Info.flags = MachineMemOperand::MONone; Info.offset = 0; @@ -4795,6 +4748,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { + Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; @@ -4810,6 +4764,31 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; break; } + case GATHER: + case GATHER_AVX2: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOLoad; + break; + } + case SCATTER: { + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOStore; + break; + } default: return false; } @@ -4820,7 +4799,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. -bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; @@ -4837,6 +4817,26 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + + // If this is an (1) AVX vector load with (2) multiple uses and (3) all of + // those uses are extracted directly into a store, then the extract + store + // can be store-folded. Therefore, it's probably not worth splitting the load. + EVT VT = Load->getValueType(0); + if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { + for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { + // Skip uses of the chain value. Result 0 of the node is the load value. + if (UI.getUse().getResNo() != 0) + continue; + + // If this use is not an extract + store, it's probably worth splitting. + if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || + UI->use_begin()->getOpcode() != ISD::STORE) + return true; + } + // All non-chain uses are extract + store. + return false; + } + return true; } @@ -4909,15 +4909,29 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, } bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { + unsigned Opc = VecOp.getOpcode(); + + // Assume target opcodes can't be scalarized. 
+ // TODO - do we have any exceptions? + if (Opc >= ISD::BUILTIN_OP_END) + return false; + // If the vector op is not supported, try to convert to scalar. EVT VecVT = VecOp.getValueType(); - if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT)) + if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) return true; // If the vector op is supported, but the scalar op is not, the transform may // not be worthwhile. EVT ScalarVT = VecVT.getScalarType(); - return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT); + return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); +} + +bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { + // TODO: Allow vectors? + if (VT.isVector()) + return false; + return VT.isSimple() || !isOperationExpand(Opcode, VT); } bool X86TargetLowering::isCheapToSpeculateCttz() const { @@ -4930,8 +4944,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } -bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, - EVT BitcastVT) const { +bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; @@ -4939,7 +4954,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; - return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT); + // If both types are legal vectors, it's always ok to convert them. + if (LoadVT.isVector() && BitcastVT.isVector() && + isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) + return true; + + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, @@ -4953,6 +4973,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } + // Make sure we don't merge greater than our preferred vector + // width. + if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) + return false; return true; } @@ -4998,7 +5022,25 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const { return Subtarget.hasSSE2(); } -bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const { +bool X86TargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + EVT VT = N->getValueType(0); + if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || + (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { + // Only fold if the shift values are equal - so it folds to AND. + // TODO - we should fold if either is a non-uniform vector but we don't do + // the fold for non-splats yet. + return N->getOperand(1) == N->getOperand(0).getOperand(1); + } + return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); +} + +bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { EVT VT = Y.getValueType(); // For vectors, we don't have a preference, but we probably want a mask. 
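The hunk above adds a shouldFoldConstantShiftPairToMask override that only permits the fold when both shift amounts are equal, "so it folds to AND". As a standalone illustration (not part of the commit, and not LLVM code), the following minimal C++ sketch checks the scalar identity that makes that fold valid: with equal shift amount C, an srl+shl pair clears the low C bits and a shl+srl pair clears the high C bits, so each is a single AND with a constant mask. The helper names and test values are purely illustrative.

#include <cassert>
#include <cstdint>

// srl followed by shl with the same amount C.
static uint32_t srlShl(uint32_t X, unsigned C) { return (X >> C) << C; }
// shl followed by srl with the same amount C.
static uint32_t shlSrl(uint32_t X, unsigned C) { return (X << C) >> C; }

int main() {
  for (uint32_t X : {0x00000000u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    for (unsigned C = 1; C < 32; ++C) {
      assert(srlShl(X, C) == (X & (0xFFFFFFFFu << C))); // clears the low C bits
      assert(shlSrl(X, C) == (X & (0xFFFFFFFFu >> C))); // clears the high C bits
    }
  }
  return 0;
}

With unequal shift amounts the pair is a shift plus a mask rather than a plain AND, which is why the new hook declines those cases and defers to the generic TargetLoweringBase behaviour.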
@@ -5048,8 +5090,8 @@ static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } -/// Return true if every element in Mask, beginning -/// from position Pos and ending in Pos+Size is the undef sentinel value. +/// Return true if every element in Mask, beginning from position Pos and ending +/// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (Mask[i] != SM_SentinelUndef) @@ -5057,6 +5099,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { return true; } +/// Return true if the mask creates a vector whose lower half is undefined. +static bool isUndefLowerHalf(ArrayRef<int> Mask) { + unsigned NumElts = Mask.size(); + return isUndefInRange(Mask, 0, NumElts / 2); +} + +/// Return true if the mask creates a vector whose upper half is undefined. +static bool isUndefUpperHalf(ArrayRef<int> Mask) { + unsigned NumElts = Mask.size(); + return isUndefInRange(Mask, NumElts / 2, NumElts / 2); +} + /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); @@ -5409,6 +5463,53 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, DAG.getIntPtrConstant(0, dl)); } +/// Widen a vector to a larger size with the same scalar type, with the new +/// elements either zero or undef. +static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, unsigned WideSizeInBits) { + assert(Vec.getValueSizeInBits() < WideSizeInBits && + (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && + "Unsupported vector widening type"); + unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); + MVT SVT = Vec.getSimpleValueType().getScalarType(); + MVT VT = MVT::getVectorVT(SVT, WideNumElts); + return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); +} + +// Helper function to collect subvector ops that are concated together, +// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. +// The subvectors in Ops are guaranteed to be the same type. +static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { + assert(Ops.empty() && "Expected an empty ops vector"); + + if (N->getOpcode() == ISD::CONCAT_VECTORS) { + Ops.append(N->op_begin(), N->op_end()); + return true; + } + + if (N->getOpcode() == ISD::INSERT_SUBVECTOR && + isa<ConstantSDNode>(N->getOperand(2))) { + SDValue Src = N->getOperand(0); + SDValue Sub = N->getOperand(1); + const APInt &Idx = N->getConstantOperandAPInt(2); + EVT VT = Src.getValueType(); + EVT SubVT = Sub.getValueType(); + + // TODO - Handle more general insert_subvector chains. + if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && + Idx == (VT.getVectorNumElements() / 2) && + Src.getOpcode() == ISD::INSERT_SUBVECTOR && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + } + + return false; +} + // Helper for splitting operands of an operation to legal target size and // apply a function on each part. 
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in @@ -5457,19 +5558,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } -// Return true if the instruction zeroes the unused upper part of the -// destination and accepts mask. -static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { - switch (Opcode) { - default: - return false; - case X86ISD::CMPM: - case X86ISD::CMPM_RND: - case ISD::SETCC: - return true; - } -} - /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -5626,10 +5714,29 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { return DAG.getBitcast(VT, Vec); } -static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, - SelectionDAG &DAG) { +// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. +static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { + switch (Opcode) { + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: + return ISD::ANY_EXTEND_VECTOR_INREG; + case ISD::ZERO_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + return ISD::ZERO_EXTEND_VECTOR_INREG; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: + return ISD::SIGN_EXTEND_VECTOR_INREG; + } + llvm_unreachable("Unknown opcode"); +} + +static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, + SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); + assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || + ISD::ZERO_EXTEND == Opcode) && + "Unknown extension opcode"); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. @@ -5642,13 +5749,10 @@ static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, InVT = In.getValueType(); } - if (VT.getVectorNumElements() == InVT.getVectorNumElements()) - return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, VT, In); + if (VT.getVectorNumElements() != InVT.getVectorNumElements()) + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); - return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG, - DL, VT, In); + return DAG.getNode(Opcode, DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. @@ -5686,18 +5790,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } -// Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops. 
-static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) { - while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) - V = V.getOperand(0); - return V; -} - -static const Constant *getTargetConstantFromNode(SDValue Op) { - Op = peekThroughBitcasts(Op); - - auto *Load = dyn_cast<LoadSDNode>(Op); - if (!Load) +static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { + if (!Load || !ISD::isNormalLoad(Load)) return nullptr; SDValue Ptr = Load->getBasePtr(); @@ -5712,6 +5806,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) { return CNode->getConstVal(); } +static const Constant *getTargetConstantFromNode(SDValue Op) { + Op = peekThroughBitcasts(Op); + return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)); +} + +const Constant * +X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { + assert(LD && "Unexpected null LoadSDNode"); + return getTargetConstantFromNode(LD); +} + // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, @@ -5778,8 +5883,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; - APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); - EltBits[i] = Bits.getZExtValue(); + EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; @@ -5899,6 +6003,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + // Extract constant bits from a subvector broadcast. + if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { + SmallVector<APInt, 16> SubEltBits; + if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, + UndefElts, SubEltBits, AllowWholeUndefs, + AllowPartialUndefs)) { + UndefElts = APInt::getSplat(NumElts, UndefElts); + while (EltBits.size() < NumElts) + EltBits.append(SubEltBits.begin(), SubEltBits.end()); + return true; + } + } + // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && @@ -5914,6 +6031,29 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return CastBitData(UndefSrcElts, SrcEltBits); } + // Insert constant bits from a base and sub vector sources. + if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && + isa<ConstantSDNode>(Op.getOperand(2))) { + // TODO - support insert_subvector through bitcasts. + if (EltSizeInBits != VT.getScalarSizeInBits()) + return false; + + APInt UndefSubElts; + SmallVector<APInt, 32> EltSubBits; + if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, + UndefSubElts, EltSubBits, + AllowWholeUndefs, AllowPartialUndefs) && + getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, + UndefElts, EltBits, AllowWholeUndefs, + AllowPartialUndefs)) { + unsigned BaseIdx = Op.getConstantOperandVal(2); + UndefElts.insertBits(UndefSubElts, BaseIdx); + for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) + EltBits[BaseIdx + i] = EltSubBits[i]; + return true; + } + } + // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa<ConstantSDNode>(Op.getOperand(1))) { @@ -6068,6 +6208,34 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, } } +// Split the demanded elts of a HADD/HSUB node between its operands. 
+static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, + APInt &DemandedLHS, APInt &DemandedRHS) { + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = DemandedElts.getBitWidth(); + int NumEltsPerLane = NumElts / NumLanes; + int HalfEltsPerLane = NumEltsPerLane / 2; + + DemandedLHS = APInt::getNullValue(NumElts); + DemandedRHS = APInt::getNullValue(NumElts); + + // Map DemandedElts to the horizontal operands. + for (int Idx = 0; Idx != NumElts; ++Idx) { + if (!DemandedElts[Idx]) + continue; + int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; + int LocalIdx = Idx % NumEltsPerLane; + if (LocalIdx < HalfEltsPerLane) { + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); + } else { + LocalIdx -= HalfEltsPerLane; + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); + } + } +} + /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. @@ -6468,14 +6636,15 @@ static bool setTargetShuffleZeroElements(SDValue N, static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - const SelectionDAG &DAG); + SelectionDAG &DAG); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. -static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, +static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, + SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, - const SelectionDAG &DAG) { + SelectionDAG &DAG) { Mask.clear(); Ops.clear(); @@ -6483,8 +6652,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && - "Expected byte aligned value types"); + if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) + return false; + assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); unsigned Opcode = N.getOpcode(); switch (Opcode) { @@ -6524,6 +6694,40 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } case ISD::OR: { + // Inspect each operand at the byte level. We can merge these into a + // blend shuffle mask if for each byte at least one is masked out (zero). 
+ KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); + KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); + if (Known0.One.isNullValue() && Known1.One.isNullValue()) { + bool IsByteMask = true; + unsigned NumSizeInBytes = NumSizeInBits / 8; + unsigned NumBytesPerElt = NumBitsPerElt / 8; + APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); + APInt SelectMask = APInt::getNullValue(NumBytesPerElt); + for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { + unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue(); + unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue(); + if (LHS == 255 && RHS == 0) + SelectMask.setBit(i); + else if (LHS == 255 && RHS == 255) + ZeroMask.setBit(i); + else if (!(LHS == 0 && RHS == 255)) + IsByteMask = false; + } + if (IsByteMask) { + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) { + for (unsigned j = 0; j != NumBytesPerElt; ++j) { + unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0); + int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs)); + Mask.push_back(Idx); + } + } + Ops.push_back(N.getOperand(0)); + Ops.push_back(N.getOperand(1)); + return true; + } + } + // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); @@ -6558,9 +6762,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } case ISD::INSERT_SUBVECTOR: { - // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where - // SRC0/SRC1 are both of the same valuetype VT. - // TODO - add peekThroughOneUseBitcasts support. SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); @@ -6568,28 +6769,57 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, if (!isa<ConstantSDNode>(N.getOperand(2)) || !N->isOnlyUserOf(Sub.getNode())) return false; + uint64_t InsertIdx = N.getConstantOperandVal(2); + // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). + if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0).getValueType() == VT && + isa<ConstantSDNode>(Sub.getOperand(1))) { + uint64_t ExtractIdx = Sub.getConstantOperandVal(1); + for (int i = 0; i != (int)NumElts; ++i) + Mask.push_back(i); + for (int i = 0; i != (int)NumSubElts; ++i) + Mask[InsertIdx + i] = NumElts + ExtractIdx + i; + Ops.push_back(Src); + Ops.push_back(Sub.getOperand(0)); + return true; + } + // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). 
SmallVector<int, 64> SubMask; SmallVector<SDValue, 2> SubInputs; - if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) || - SubMask.size() != NumSubElts) + if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG)) return false; + if (SubMask.size() != NumSubElts) { + assert(((SubMask.size() % NumSubElts) == 0 || + (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); + if ((NumSubElts % SubMask.size()) == 0) { + int Scale = NumSubElts / SubMask.size(); + SmallVector<int,64> ScaledSubMask; + scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask); + SubMask = ScaledSubMask; + } else { + int Scale = SubMask.size() / NumSubElts; + NumSubElts = SubMask.size(); + NumElts *= Scale; + InsertIdx *= Scale; + } + } Ops.push_back(Src); for (SDValue &SubInput : SubInputs) { - if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR || - SubInput.getOperand(0).getValueType() != VT || - !isa<ConstantSDNode>(SubInput.getOperand(1))) - return false; - Ops.push_back(SubInput.getOperand(0)); + EVT SubSVT = SubInput.getValueType().getScalarType(); + EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, + NumSizeInBits / SubSVT.getSizeInBits()); + Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, + DAG.getUNDEF(AltVT), SubInput, + DAG.getIntPtrConstant(0, SDLoc(N)))); } - int InsertIdx = N.getConstantOperandVal(2); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; - int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1); - M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts); + M = (NumElts * (1 + InputIdx)) + (M % NumSubElts); } Mask[i + InsertIdx] = M; } @@ -6674,16 +6904,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); + APInt EltsLHS, EltsRHS; + getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); + // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { - if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) || - (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)) + if ((!N0.isUndef() && + DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || + (!N1.isUndef() && + DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) || - (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask))) + if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || + (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) return false; } @@ -6728,15 +6963,54 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, } return true; } - case ISD::ZERO_EXTEND_VECTOR_INREG: - case ISD::ZERO_EXTEND: { - // TODO - add support for VPMOVZX with smaller input vector types. 
+ case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); - if (NumSizeInBits != SrcVT.getSizeInBits()) - break; - DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, + if (!SrcVT.isVector()) + return false; + + if (NumSizeInBits != SrcVT.getSizeInBits()) { + assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && + "Illegal broadcast type"); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumSizeInBits / SrcVT.getScalarSizeInBits()); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, + DAG.getUNDEF(SrcVT), Src, + DAG.getIntPtrConstant(0, SDLoc(N))); + } + + Ops.push_back(Src); + Mask.append(NumElts, 0); + return true; + } + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + case ISD::ANY_EXTEND_VECTOR_INREG: { + SDValue Src = N.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // Extended source must be a simple vector. + if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || + (SrcVT.getScalarSizeInBits() % 8) != 0) + return false; + + unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); + bool IsAnyExtend = + (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); + DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, Mask); + + if (NumSizeInBits != SrcVT.getSizeInBits()) { + assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && + "Illegal zero-extension type"); + SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), + NumSizeInBits / NumSrcBitsPerElt); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, + DAG.getUNDEF(SrcVT), Src, + DAG.getIntPtrConstant(0, SDLoc(N))); + } + Ops.push_back(Src); return true; } @@ -6745,7 +7019,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return false; } -/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. +/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask) { int MaskWidth = Mask.size(); @@ -6761,13 +7035,28 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, M = SM_SentinelUndef; // Check for unused inputs. - if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { - UsedInputs.push_back(Inputs[i]); + if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; continue; } - for (int &M : Mask) - if (lo <= M) - M -= MaskWidth; + + // Check for repeated inputs. + bool IsRepeat = false; + for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { + if (UsedInputs[j] != Inputs[i]) + continue; + for (int &M : Mask) + if (lo <= M) + M = (M < hi) ? 
((M - lo) + (j * MaskWidth)) : (M - MaskWidth); + IsRepeat = true; + break; + } + if (IsRepeat) + continue; + + UsedInputs.push_back(Inputs[i]); } Inputs = UsedInputs; } @@ -6780,9 +7069,11 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - const SelectionDAG &DAG) { + SelectionDAG &DAG) { + unsigned NumElts = Op.getValueType().getVectorNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(NumElts); if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) - if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) + if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); @@ -6838,6 +7129,28 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, Depth+1); } + // Recurse into insert_subvector base/sub vector to find scalars. + if (Opcode == ISD::INSERT_SUBVECTOR && + isa<ConstantSDNode>(N->getOperand(2))) { + SDValue Vec = N->getOperand(0); + SDValue Sub = N->getOperand(1); + EVT SubVT = Sub.getValueType(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + uint64_t SubIdx = N->getConstantOperandVal(2); + + if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) + return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); + return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); + } + + // Recurse into extract_subvector src vector to find scalars. + if (Opcode == ISD::EXTRACT_SUBVECTOR && + isa<ConstantSDNode>(N->getOperand(1))) { + SDValue Src = N->getOperand(0); + uint64_t SrcIdx = N->getConstantOperandVal(1); + return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); + } + // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); @@ -6880,7 +7193,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register - // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. + // dependency else use SCALAR_TO_VECTOR. if (First) { First = false; if (NumZero || 0 != i) @@ -6889,7 +7202,6 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } @@ -6916,50 +7228,51 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, SDLoc dl(Op); SDValue V; - bool First = true; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. - for (unsigned i = 0; i < 16; ++i) { + for (unsigned i = 0; i < 16; i += 2) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; - if (ThisIsNonZero && First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0; + if (!ThisIsNonZero && !NextIsNonZero) + continue; + + // FIXME: Investigate combining the first 4 bytes as a i32 instead. 
+ SDValue Elt; + if (ThisIsNonZero) { + if (NumZero || NextIsNonZero) + Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32); else - V = DAG.getUNDEF(MVT::v8i16); - First = false; + Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); } - if ((i & 1) != 0) { - // FIXME: Investigate extending to i32 instead of just i16. - // FIXME: Investigate combining the first 4 bytes as a i32 instead. - SDValue ThisElt, LastElt; - bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; - if (LastIsNonZero) { - LastElt = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); - } - if (ThisIsNonZero) { - ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, - DAG.getConstant(8, dl, MVT::i8)); - if (LastIsNonZero) - ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); - } else - ThisElt = LastElt; - - if (ThisElt) { - if (1 == i) { - V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) - : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); - V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); - V = DAG.getBitcast(MVT::v8i16, V); - } else { - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i / 2, dl)); - } + if (NextIsNonZero) { + SDValue NextElt = Op.getOperand(i + 1); + if (i == 0 && NumZero) + NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32); + else + NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32); + NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt, + DAG.getConstant(8, dl, MVT::i8)); + if (ThisIsNonZero) + Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt); + else + Elt = NextElt; + } + + // If our first insertion is not the first index then insert into zero + // vector to break any register dependency else use SCALAR_TO_VECTOR. + if (!V) { + if (i != 0) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); + V = DAG.getBitcast(MVT::v8i16, V); + continue; } } + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt, + DAG.getIntPtrConstant(i / 2, dl)); } return DAG.getBitcast(MVT::v16i8, V); @@ -7002,9 +7315,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, } // Find all zeroable elements. - std::bitset<4> Zeroable; - for (int i=0; i < 4; ++i) { - SDValue Elt = Op->getOperand(i); + std::bitset<4> Zeroable, Undefs; + for (int i = 0; i < 4; ++i) { + SDValue Elt = Op.getOperand(i); + Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && @@ -7014,10 +7328,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; - for (unsigned i=0; i < 4; ++i) { + for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; - SDValue Elt = Op->getOperand(i); + SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa<ConstantSDNode>(Elt.getOperand(1))) return SDValue(); @@ -7056,10 +7370,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. - SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + SDValue VZeroOrUndef = (Zeroable == Undefs) + ? 
DAG.getUNDEF(VT) + : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); - return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. @@ -7079,7 +7395,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; - CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); + CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) @@ -7200,9 +7516,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, unsigned NumElems = Elts.size(); int LastLoadedElt = -1; - SmallBitVector LoadMask(NumElems, false); - SmallBitVector ZeroMask(NumElems, false); - SmallBitVector UndefMask(NumElems, false); + APInt LoadMask = APInt::getNullValue(NumElems); + APInt ZeroMask = APInt::getNullValue(NumElems); + APInt UndefMask = APInt::getNullValue(NumElems); + + SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr); // For each element in the initializer, see if we've found a load, zero or an // undef. @@ -7210,38 +7528,52 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); + if (Elt.isUndef()) { + UndefMask.setBit(i); + continue; + } + if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { + ZeroMask.setBit(i); + continue; + } - if (Elt.isUndef()) - UndefMask[i] = true; - else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) - ZeroMask[i] = true; - else if (ISD::isNON_EXTLoad(Elt.getNode())) { - LoadMask[i] = true; - LastLoadedElt = i; - // Each loaded element must be the correct fractional portion of the - // requested vector load. - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) - return SDValue(); - } else + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) + return SDValue(); + + if (!ISD::isNON_EXTLoad(Elt.getNode())) return SDValue(); + + Loads[i] = cast<LoadSDNode>(Elt); + LoadMask.setBit(i); + LastLoadedElt = i; } - assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && + assert((ZeroMask.countPopulation() + UndefMask.countPopulation() + + LoadMask.countPopulation()) == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. - if (UndefMask.count() == NumElems) + if (UndefMask.countPopulation() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? - if ((ZeroMask | UndefMask).count() == NumElems) + if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems) return VT.isInteger() ? 
DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - int FirstLoadedElt = LoadMask.find_first(); + int FirstLoadedElt = LoadMask.countTrailingZeros(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); - LoadSDNode *LDBase = cast<LoadSDNode>(EltBase); - EVT LDBaseVT = EltBase.getValueType(); + EVT EltBaseVT = EltBase.getValueType(); + assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && + "Register/Memory size mismatch"); + LoadSDNode *LDBase = Loads[FirstLoadedElt]; + assert(LDBase && "Did not find base load for merging consecutive loads"); + unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); + unsigned BaseSizeInBytes = BaseSizeInBits / 8; + int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; + assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a @@ -7250,11 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - SDValue Elt = peekThroughBitcasts(Elts[i]); - LoadSDNode *LD = cast<LoadSDNode>(Elt); - if (!DAG.areNonVolatileConsecutiveLoads( - LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, - i - FirstLoadedElt)) { + if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, + i - FirstLoadedElt)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; @@ -7264,11 +7593,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } - SmallVector<LoadSDNode *, 8> Loads; - for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i) - if (LoadMask[i]) - Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i]))); - auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && @@ -7277,23 +7601,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) - DAG.makeEquivalentMemoryOrdering(LD, NewLd); + if (LD) + DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; - // LOAD - all consecutive load/undefs (must start/end with a load). - // If we have found an entire vector of loads and undefs, then return a large - // load of the entire vector width starting at the base pointer. - // If the vector contains zeros, then attempt to shuffle those elements. - if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && + // Check if the base load is entirely dereferenceable. + bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( + VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); + + // LOAD - all consecutive load/undefs (must start/end with a load or be + // entirely dereferenceable). If we have found an entire vector of loads and + // undefs, then return a large load of the entire vector width starting at the + // base pointer. If the vector contains zeros, then attempt to shuffle those + // elements. 
+ if (FirstLoadedElt == 0 && + (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { - assert(LDBase && "Did not find base load for merging consecutive loads"); - EVT EltVT = LDBase->getValueType(0); - // Ensure that the input vector size for the merged loads matches the - // cumulative size of the input elements. - if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) - return SDValue(); - if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); @@ -7303,12 +7627,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); - if (IsConsecutiveLoad) + if (NumElems == 1) + return DAG.getBitcast(VT, Elts[FirstLoadedElt]); + + if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. - if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { + if (!isAfterLegalize && VT.isVector()) { SmallVector<int, 4> ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) @@ -7323,16 +7650,28 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } - int LoadSize = - (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); + // If the upper half of a ymm/zmm load is undef then just load the lower half. + if (VT.is256BitVector() || VT.is512BitVector()) { + unsigned HalfNumElems = NumElems / 2; + if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) { + EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); + SDValue HalfLD = + EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, + DAG, Subtarget, isAfterLegalize); + if (HalfLD) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), + HalfLD, DAG.getIntPtrConstant(0, DL)); + } + } // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && - (LoadSize == 32 || LoadSize == 64) && + (LoadSizeInBits == 32 || LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { - MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize) - : MVT::getIntegerVT(LoadSize); - MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); + MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) + : MVT::getIntegerVT(LoadSizeInBits); + MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -7342,14 +7681,85 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) - DAG.makeEquivalentMemoryOrdering(LD, ResNode); + if (LD) + DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } + // BROADCAST - match the smallest possible repetition pattern, load that + // scalar/subvector element and then broadcast to the entire vector. 
+ if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && + (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { + for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { + unsigned RepeatSize = SubElems * BaseSizeInBits; + unsigned ScalarSize = std::min(RepeatSize, 64u); + if (!Subtarget.hasAVX2() && ScalarSize < 32) + continue; + + bool Match = true; + SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); + for (unsigned i = 0; i != NumElems && Match; ++i) { + if (!LoadMask[i]) + continue; + SDValue Elt = peekThroughBitcasts(Elts[i]); + if (RepeatedLoads[i % SubElems].isUndef()) + RepeatedLoads[i % SubElems] = Elt; + else + Match &= (RepeatedLoads[i % SubElems] == Elt); + } + + // We must have loads at both ends of the repetition. + Match &= !RepeatedLoads.front().isUndef(); + Match &= !RepeatedLoads.back().isUndef(); + if (!Match) + continue; + + EVT RepeatVT = + VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) + ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize) + : EVT::getFloatingPointVT(ScalarSize); + if (RepeatSize > ScalarSize) + RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, + RepeatSize / ScalarSize); + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), + VT.getSizeInBits() / ScalarSize); + if (TLI.isTypeLegal(BroadcastVT)) { + if (SDValue RepeatLoad = EltsFromConsecutiveLoads( + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { + unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST + : X86ISD::VBROADCAST; + SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); + return DAG.getBitcast(VT, Broadcast); + } + } + } + } + return SDValue(); } +// Combine a vector ops (shuffles etc.) that is equal to build_vector load1, +// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses +// are consecutive, non-overlapping, and in the right order. +static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool isAfterLegalize) { + SmallVector<SDValue, 64> Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + Elts.push_back(Elt); + continue; + } + return SDValue(); + } + assert(Elts.size() == VT.getVectorNumElements()); + return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, + isAfterLegalize); +} + static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); @@ -7373,12 +7783,20 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue, return ConstantVector::get(ArrayRef<Constant *>(ConstantVec)); } -static bool isUseOfShuffle(SDNode *N) { +static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { - if (isTargetShuffle(U->getOpcode())) + unsigned Opc = U->getOpcode(); + // VPERMV/VPERMV3 shuffles can never fold their index operands. 
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) + return false; + if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) + return false; + if (isTargetShuffle(Opc)) + return true; + if (Opc == ISD::BITCAST) // Ignore bitcasts + return isFoldableUseOfShuffle(U); + if (N->hasOneUse()) return true; - if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts - return isUseOfShuffle(U); } return false; } @@ -7486,7 +7904,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. - if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) + if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -7581,7 +7999,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -8330,6 +8748,22 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); + unsigned NumElts = VT.getVectorNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + if (BV->getOperand(i).isUndef()) + DemandedElts.clearBit(i); + + // If we don't need the upper xmm, then perform as a xmm hop. + unsigned HalfNumElts = NumElts / 2; + if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); + V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); + V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); + SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); + return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); + } + return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); } @@ -8338,11 +8772,8 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. - unsigned NumNonUndefs = 0; - for (const SDValue &V : BV->op_values()) - if (!V.isUndef()) - ++NumNonUndefs; - + unsigned NumNonUndefs = + count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); if (NumNonUndefs < 2) return SDValue(); @@ -8350,23 +8781,15 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, // int/FP at 128-bit/256-bit. Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. 
MVT VT = BV->getSimpleValueType(0); - unsigned HOpcode; - SDValue V0, V1; - if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) - if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) - return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); - - if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2()) + if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || + ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || + ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || + ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { + unsigned HOpcode; + SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); + } // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) @@ -8481,9 +8904,15 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). + bool IsShift = false; switch (Opcode) { default: return SDValue(); + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + IsShift = true; + break; case ISD::AND: case ISD::XOR: case ISD::OR: @@ -8504,10 +8933,24 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, // We expect the canonicalized RHS operand to be the constant. if (!isa<ConstantSDNode>(RHS)) return SDValue(); + + // Extend shift amounts. + if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { + if (!IsShift) + return SDValue(); + RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); + } + LHSElts.push_back(LHS); RHSElts.push_back(RHS); } + // Limit to shifts by uniform immediates. + // TODO: Only accept vXi8/vXi64 special cases? + // TODO: Permit non-uniform XOP/AVX2/MULLO cases? + if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) + return SDValue(); + SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); @@ -9288,60 +9731,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, return Vec; } -// Return true if all the operands of the given CONCAT_VECTORS node are zeros -// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) -static bool isExpandWithZeros(const SDValue &Op) { - assert(Op.getOpcode() == ISD::CONCAT_VECTORS && - "Expand with zeros only possible in CONCAT_VECTORS nodes!"); - - for (unsigned i = 1; i < Op.getNumOperands(); i++) - if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) - return false; - - return true; -} - // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. 
-static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { - unsigned Opc = Op.getOpcode(); - - assert(Opc == ISD::CONCAT_VECTORS && - Op.getSimpleValueType().getVectorElementType() == MVT::i1 && - "Unexpected node to check for type promotion!"); - - // As long as we are concatenating zeros to the upper part of a previous node - // result, climb up the tree until a node with different opcode is - // encountered - while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { - if (Opc == ISD::INSERT_SUBVECTOR) { - if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && - Op.getConstantOperandVal(2) == 0) - Op = Op.getOperand(1); - else - return SDValue(); - } else { // Opc == ISD::CONCAT_VECTORS - if (isExpandWithZeros(Op)) - Op = Op.getOperand(0); - else - return SDValue(); - } - Opc = Op.getOpcode(); - } - - // Check if the first inserted node zeroes the upper bits, or an 'and' result - // of a node that zeros the upper bits (its masked version). - if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || - (Op.getOpcode() == ISD::AND && - (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || - isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { - return Op; - } - - return SDValue(); -} - // TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, @@ -9353,13 +9745,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - // If this node promotes - by concatenating zeroes - the type of the result - // of a node with instruction that zeroes all upper (irrelevant) bits of the - // output register, mark it as legal and catch the pattern in instruction - // selection to avoid emitting extra instructions (for zeroing upper bits). - if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) - return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl); - unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; @@ -9618,6 +10003,8 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; + assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && + "Illegal target shuffle mask"); for (int i = 0; i < Size; ++i) if (Mask[i] == SM_SentinelUndef) @@ -9687,6 +10074,40 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { return IsUnpackwdMask; } +static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) { + // Create 128-bit vector type based on mask size. + MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); + MVT VT = MVT::getVectorVT(EltVT, Mask.size()); + + // We can't assume a canonical shuffle mask, so try the commuted version too. + SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end()); + ShuffleVectorSDNode::commuteMask(CommutedMask); + + // Match any of unary/binary or low/high. + for (unsigned i = 0; i != 4; ++i) { + SmallVector<int, 16> UnpackMask; + createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); + if (isTargetShuffleEquivalent(Mask, UnpackMask) || + isTargetShuffleEquivalent(CommutedMask, UnpackMask)) + return true; + } + return false; +} + +/// Return true if a shuffle mask chooses elements identically in its top and +/// bottom halves. For example, any splat mask has the same top and bottom +/// halves. If an element is undefined in only one half of the mask, the halves +/// are not considered identical. 
+static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) { + assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); + unsigned HalfSize = Mask.size() / 2; + for (unsigned i = 0; i != HalfSize; ++i) { + if (Mask[i] != Mask[i + HalfSize]) + return false; + } + return true; +} + /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -9826,12 +10247,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable, } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. -static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; @@ -9885,11 +10305,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND -static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const APInt &Zeroable, - ArrayRef<int> Mask, SDValue &V1, - SDValue &V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, + const APInt &Zeroable, + ArrayRef<int> Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) @@ -9905,9 +10325,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getSelect(DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -9997,9 +10415,9 @@ static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. -static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, SDValue V2, + SelectionDAG &DAG) { SmallVector<int, 8> Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) @@ -10061,10 +10479,10 @@ static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps, // // But when avx512vl is available, one can just use a single vpmovdw // instruction. 
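// Aside: the "4-lane 8-bit shuffle immediate" referenced above is the classic
// PSHUFD/SHUFPS imm8, two bits per destination lane selecting a source lane.
// A standalone encoder; mapping an undef lane to its own index is just one
// reasonable default, not necessarily what the helper above does:
#include <cassert>
#include <cstdint>
#include <vector>

static uint8_t shuffleImm8ForMask(const std::vector<int> &Mask) {
  assert(Mask.size() == 4 && "the imm8 form only covers 4 lanes");
  uint8_t Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? int(i) : Mask[i];   // undef lane: keep its own index
    assert(M >= 0 && M < 4 && "per-lane selector is 2 bits");
    Imm |= uint8_t(M) << (i * 2);
  }
  return Imm;
}
// e.g. the identity mask {0,1,2,3} encodes as 0xE4 and a splat of lane 0 as 0x00.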
-static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (VT != MVT::v16i8 && VT != MVT::v8i16) return SDValue(); @@ -10169,10 +10587,9 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, return false; } -static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, @@ -10187,14 +10604,32 @@ static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { - assert(!VT.isFloatingPoint() && "Floating point types are not supported"); +static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); - SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); + SDValue Zero, AllOnes; + // Use f64 if i64 isn't legal. + if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { + EltVT = MVT::f64; + MaskVT = MVT::getVectorVT(EltVT, Mask.size()); + } + + MVT LogicVT = VT; + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + Zero = DAG.getConstantFP(0.0, DL, EltVT); + AllOnes = DAG.getConstantFP( + APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); + LogicVT = + MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); + } else { + Zero = DAG.getConstant(0, DL, EltVT); + AllOnes = DAG.getAllOnesConstant(DL, EltVT); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -10212,8 +10647,11 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, if (!V) return SDValue(); // No non-zeroable elements! - SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); - return DAG.getNode(ISD::AND, DL, VT, V, VMask); + SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); + VMask = DAG.getBitcast(LogicVT, VMask); + V = DAG.getBitcast(LogicVT, V); + SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); + return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. @@ -10221,9 +10659,9 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. 
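// Aside: lowerShuffleAsBitMask() above treats a shuffle whose displaced lanes
// are all zeroable as a single AND with a 0/all-ones constant. A loose
// standalone model, assuming 32-bit lanes and that every surviving lane is
// already in place in one source (the real code verifies exactly that):
#include <cstdint>
#include <vector>

static std::vector<uint32_t> blendWithZeroAsAnd(const std::vector<uint32_t> &V,
                                                const std::vector<bool> &Zeroable) {
  std::vector<uint32_t> Out(V.size());
  for (size_t i = 0; i != V.size(); ++i) {
    uint32_t LaneMask = Zeroable[i] ? 0u : 0xFFFFFFFFu; // all-zeros or all-ones lane
    Out[i] = V[i] & LaneMask;                           // the shuffle collapses to one AND
  }
  return Out;
}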
-static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); @@ -10305,11 +10743,11 @@ static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. -static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Original, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; @@ -10325,45 +10763,24 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { - case MVT::v2f64: - case MVT::v4f32: - case MVT::v4f64: - case MVT::v8f32: - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; + case MVT::v4f64: + case MVT::v8f32: + assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); + LLVM_FALLTHROUGH; + case MVT::v2f64: case MVT::v2i64: + case MVT::v4f32: case MVT::v4i32: - // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into - // that instruction. - if (Subtarget.hasAVX2()) { - // Scale the blend by the number of 32-bit dwords per element. - int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); - MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; - V1 = DAG.getBitcast(BlendVT, V1); - V2 = DAG.getBitcast(BlendVT, V2); - return DAG.getBitcast( - VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } - LLVM_FALLTHROUGH; - case MVT::v8i16: { - // For integer shuffles we need to expand the mask and cast the inputs to - // v8i16s prior to blending. - int Scale = 8 / VT.getVectorNumElements(); - BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); - V1 = DAG.getBitcast(MVT::v8i16, V1); - V2 = DAG.getBitcast(MVT::v8i16, V2); - return DAG.getBitcast(VT, - DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); - } + case MVT::v8i16: + assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { - assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); + assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 
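// Aside: the BlendMask built above sets bit i when result lane i should come
// from V2 rather than V1. When a wider-element blend is expressed with a
// narrower instruction (the removed VPBLENDD path), each bit is replicated
// once per dword, as in scaleVectorShuffleBlendMask. A standalone version:
#include <cstdint>

static uint64_t scaleBlendMask(uint64_t BlendMask, int NumElts, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != NumElts; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale); // replicate bit i, Scale times
  return Scaled;
}
// e.g. a v2i64 blend mask 0b10 scaled by 2 becomes the v4i32 mask 0b1100.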
@@ -10391,14 +10808,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, } LLVM_FALLTHROUGH; } - case MVT::v16i8: - case MVT::v32i8: { - assert((VT.is128BitVector() || Subtarget.hasAVX2()) && - "256-bit byte-blends require AVX2 support!"); + case MVT::v32i8: + assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); + LLVM_FALLTHROUGH; + case MVT::v16i8: { + assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { @@ -10456,6 +10874,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { + // Attempt to lower to a bitmask if we can. Only if not optimizing for size. + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!OptForSize) { + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return Masked; + } + + // Otherwise load an immediate into a GPR, cast to k-register, and use a + // masked move. MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); @@ -10471,11 +10899,11 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. -static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG, - bool ImmBlends = false) { +static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG, + bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector<int, 32> BlendMask(Mask.size(), -1); @@ -10510,10 +10938,10 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, /// /// This matches the pattern where we can unpack elements from two inputs and /// then reduce the shuffle to a single-input (wider) permutation. -static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; @@ -10573,7 +11001,7 @@ static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. 
-static SDValue lowerVectorShuffleAsByteRotateAndPermute( +static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || @@ -10664,7 +11092,7 @@ static SDValue lowerVectorShuffleAsByteRotateAndPermute( /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. -static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( +static SDValue lowerShuffleAsDecomposedShuffleBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and @@ -10688,18 +11116,18 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( - DL, VT, V1, V2, Mask, DAG, true)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, + DAG, true)) return BlendPerm; - if (SDValue UnpackPerm = - lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) + if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, + DAG)) return UnpackPerm; - if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute( + if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. - if (SDValue BlendPerm = - lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, + DAG)) return BlendPerm; } @@ -10711,8 +11139,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( /// Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. -static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, - ArrayRef<int> Mask) { +static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: @@ -10796,8 +11223,8 @@ static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef<int> Mask) { +static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask) { // Don't accept any shuffles with zero elements. 
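// Aside: matchShuffleAsRotate() above recognises masks that are a rotation of
// the element sequence. A deliberately simplified single-source version of
// that check (the real matcher also assigns the two wrapped halves to the
// Lo/Hi operands of PALIGNR/VALIGN), where result[i] = src[(i + Rot) % N]:
#include <vector>

static int matchUnaryRotate(const std::vector<int> &Mask) {
  int N = int(Mask.size());
  int Rot = -1;                              // -1: no lane has constrained it yet
  for (int i = 0; i != N; ++i) {
    if (Mask[i] < 0)
      continue;                              // undef lanes match any rotation
    int Cand = (Mask[i] - i + N) % N;        // rotation implied by this lane
    if (Rot < 0)
      Rot = Cand;
    else if (Rot != Cand)
      return -1;                             // lanes disagree: not a rotation
  }
  return Rot <= 0 ? -1 : Rot;                // a rotation by 0 is just a no-op
}
// e.g. {1,2,3,0} is a rotation by 1, {3,0,1,2} by 3, and {0,2,1,3} is none.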
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; @@ -10807,7 +11234,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; - int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); + int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; @@ -10818,15 +11245,14 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, return Rotation * Scale; } -static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; - int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); + int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); @@ -10874,11 +11300,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); @@ -10887,7 +11312,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; - int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); + int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); @@ -10895,6 +11320,69 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, DAG.getConstant(Rotation, DL, MVT::i8)); } +/// Try to lower a vector shuffle as a byte shift sequence. +static SDValue lowerVectorShuffleAsByteShiftMask( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + assert(VT.is128BitVector() && "Only 128-bit vectors supported"); + + // We need a shuffle that has zeros at one/both ends and a sequential + // shuffle from one source within. + unsigned ZeroLo = Zeroable.countTrailingOnes(); + unsigned ZeroHi = Zeroable.countLeadingOnes(); + if (!ZeroLo && !ZeroHi) + return SDValue(); + + unsigned NumElts = Mask.size(); + unsigned Len = NumElts - (ZeroLo + ZeroHi); + if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) + return SDValue(); + + unsigned Scale = VT.getScalarSizeInBits() / 8; + ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len); + if (!isUndefOrInRange(StubMask, 0, NumElts) && + !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) + return SDValue(); + + SDValue Res = Mask[ZeroLo] < (int)NumElts ? 
V1 : V2; + Res = DAG.getBitcast(MVT::v16i8, Res); + + // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an + // inner sequential set of elements, possibly offset: + // 01234567 --> zzzzzz01 --> 1zzzzzzz + // 01234567 --> 4567zzzz --> zzzzz456 + // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz + if (ZeroLo == 0) { + unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * ZeroHi, DL, MVT::i8)); + } else if (ZeroHi == 0) { + unsigned Shift = Mask[ZeroLo] % NumElts; + Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + } else if (!Subtarget.hasSSSE3()) { + // If we don't have PSHUFB then its worth avoiding an AND constant mask + // by performing 3 byte shifts. Shuffle combining can kick in above that. + // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. + unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Shift += Mask[ZeroLo] % NumElts; + Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * Shift, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, + DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + } else + return SDValue(); + + return DAG.getBitcast(VT, Res); +} + /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -10918,11 +11406,10 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] -static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, - unsigned ScalarSizeInBits, - ArrayRef<int> Mask, int MaskOffset, - const APInt &Zeroable, - const X86Subtarget &Subtarget) { +static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, + unsigned ScalarSizeInBits, ArrayRef<int> Mask, + int MaskOffset, const APInt &Zeroable, + const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -10981,11 +11468,11 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, return -1; } -static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -10994,14 +11481,13 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, unsigned Opcode; // Try to match shuffle against V1 shift. - int ShiftAmt = matchVectorShuffleAsShift( - ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); + int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. 
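// Aside: the byte-shift-mask lowering above clears a zeroable end of a 128-bit
// value with a PSLLDQ/PSRLDQ pair instead of an AND with a constant mask. A
// standalone byte-array model of the simplest case, keeping the low Len bytes
// and zeroing the rest (the real code also re-positions the kept run and can
// use three shifts to clear both ends):
#include <array>
#include <cstdint>

using Bytes16 = std::array<uint8_t, 16>;

static Bytes16 shlBytes(const Bytes16 &V, unsigned N) { // PSLLDQ-style whole-byte shift
  Bytes16 R{};                                          // vacated low bytes become zero
  for (unsigned i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}

static Bytes16 srlBytes(const Bytes16 &V, unsigned N) { // PSRLDQ-style whole-byte shift
  Bytes16 R{};                                          // vacated high bytes become zero
  for (unsigned i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

static Bytes16 keepLowBytes(const Bytes16 &V, unsigned Len) {
  unsigned S = 16 - Len;
  return srlBytes(shlBytes(V, S), S); // up then down: the top S bytes end up zero
}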
if (ShiftAmt < 0) { - ShiftAmt = - matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), - Mask, Size, Zeroable, Subtarget); + ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, Size, Zeroable, Subtarget); V = V2; } @@ -11018,16 +11504,16 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. -static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef<int> Mask, uint64_t &BitLen, - uint64_t &BitIdx, const APInt &Zeroable) { +static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. - if (!isUndefInRange(Mask, HalfSize, HalfSize)) + if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the @@ -11074,15 +11560,15 @@ static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } -static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, - ArrayRef<int> Mask, uint64_t &BitLen, - uint64_t &BitIdx) { +static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. - if (!isUndefInRange(Mask, HalfSize, HalfSize)) + if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { @@ -11140,17 +11626,16 @@ static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. -static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; - if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) + if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); - if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) + if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), @@ -11168,7 +11653,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. 
-static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( +static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); @@ -11203,6 +11688,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. + // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. @@ -11211,7 +11697,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11234,7 +11720,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; - unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); + unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), @@ -11253,8 +11739,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || - !SafeOffset(Offset + 1)) + if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; @@ -11326,7 +11811,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. -static SDValue lowerVectorShuffleAsZeroOrAnyExtend( +static SDValue lowerShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11397,8 +11882,8 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (Offset != 0 && Matches < 2) return SDValue(); - return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); + return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt, + InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -11482,7 +11967,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. -static SDValue lowerVectorShuffleAsElementInsertion( +static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11580,10 +12065,10 @@ static SDValue lowerVectorShuffleAsElementInsertion( /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. 
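// Aside: the zero-extension lowering above looks for masks of the shape
// {0, z, 1, z, 2, z, ...}: every Scale-th lane takes the next packed source
// element and the lanes in between are zeroable. A simplified standalone
// recogniser (no offset or any-extend handling, unlike the real code):
#include <vector>

static bool isZExtShuffleMask(const std::vector<int> &Mask,
                              const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, N = int(Mask.size()); i != N; ++i) {
    if (i % Scale == 0) {
      if (Mask[i] >= 0 && Mask[i] != i / Scale)
        return false;                 // must pick the next packed element in order
    } else if (Mask[i] >= 0 && !Zeroable[i]) {
      return false;                   // the in-between lanes must be zero or undef
    }
  }
  return true;
}
// e.g. a v8i16 mask {0,-1,1,-1,2,-1,3,-1} with the odd lanes zeroable matches
// Scale == 2, i.e. a PMOVZXWD-style zero extension of the low four elements.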
-static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, - SDValue V0, int BroadcastIdx, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); @@ -11629,16 +12114,90 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } +/// Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. + if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + +/// If we are extracting two 128-bit halves of a vector and shuffling the +/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a +/// multi-shuffle lowering. +static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, + SDValue N1, ArrayRef<int> Mask, + SelectionDAG &DAG) { + EVT VT = N0.getValueType(); + assert((VT.is128BitVector() && + (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && + "VPERM* family of shuffles requires 32-bit or 64-bit elements"); + + // Check that both sources are extracts of the same source vector. + if (!N0.hasOneUse() || !N1.hasOneUse() || + N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || + N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + N0.getOperand(0) != N1.getOperand(0)) + return SDValue(); + + SDValue WideVec = N0.getOperand(0); + EVT WideVT = WideVec.getValueType(); + if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) || + !isa<ConstantSDNode>(N1.getOperand(1))) + return SDValue(); + + // Match extracts of each half of the wide source vector. Commute the shuffle + // if the extract of the low half is N1. + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 4> NewMask(Mask.begin(), Mask.end()); + const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1); + const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1); + if (ExtIndex1 == 0 && ExtIndex0 == NumElts) + ShuffleVectorSDNode::commuteMask(NewMask); + else if (ExtIndex0 != 0 || ExtIndex1 != NumElts) + return SDValue(); + + // Final bailout: if the mask is simple, we are better off using an extract + // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps + // because that avoids a constant load from memory. + if (NumElts == 4 && + (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask))) + return SDValue(); + + // Extend the shuffle mask with undef elements. 
+ NewMask.append(NumElts, -1); + + // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 + SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), + NewMask); + // This is free: ymm -> xmm. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, + DAG.getIntPtrConstant(0, DL)); +} + /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. -static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) @@ -11647,6 +12206,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); + unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; @@ -11670,29 +12230,19 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { - // Peek through bitcasts as long as BroadcastIdx can be adjusted. 
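// Aside: the rewritten broadcast lowering above tracks the broadcast element
// as a bit offset so it can walk through bitcasts and concatenated operands
// without rescaling an element index at every step. A standalone model of
// that bookkeeping (names are illustrative only):
#include <cassert>

static int descendIntoConcatOperand(int &BitOffset, int OperandBits) {
  int OpIdx = BitOffset / OperandBits; // which concatenated operand holds the element
  BitOffset %= OperandBits;            // offset within that operand
  return OpIdx;
}

static int elementIndexAt(int BitOffset, int EltBits) {
  assert(BitOffset % EltBits == 0 && "offset must fall on an element boundary");
  return BitOffset / EltBits;          // re-derive the index at any element width
}
// e.g. element 5 of a v8i32 sits at bit offset 160; inside a concat of two
// 128-bit halves that is operand 1 at offset 32, i.e. element 1 of that half.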
- SDValue VSrc = V.getOperand(0); - unsigned NumEltBits = V.getScalarValueSizeInBits(); - unsigned NumSrcBits = VSrc.getScalarValueSizeInBits(); - if ((NumEltBits % NumSrcBits) == 0) - BroadcastIdx *= (NumEltBits / NumSrcBits); - else if ((NumSrcBits % NumEltBits) == 0 && - (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0) - BroadcastIdx /= (NumSrcBits / NumEltBits); - else - break; - V = VSrc; + V = V.getOperand(0); continue; } case ISD::CONCAT_VECTORS: { - int OperandSize = - V.getOperand(0).getSimpleValueType().getVectorNumElements(); - V = V.getOperand(BroadcastIdx / OperandSize); - BroadcastIdx %= OperandSize; + int OpBitWidth = V.getOperand(0).getValueSizeInBits(); + int OpIdx = BitOffset / OpBitWidth; + V = V.getOperand(OpIdx); + BitOffset %= OpBitWidth; continue; } case ISD::INSERT_SUBVECTOR: { @@ -11701,11 +12251,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, if (!ConstantIdx) break; - int BeginIdx = (int)ConstantIdx->getZExtValue(); - int EndIdx = - BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); - if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { - BroadcastIdx -= BeginIdx; + int EltBitWidth = VOuter.getScalarValueSizeInBits(); + int Idx = (int)ConstantIdx->getZExtValue(); + int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); + int BeginOffset = Idx * EltBitWidth; + int EndOffset = BeginOffset + NumSubElts * EltBitWidth; + if (BeginOffset <= BitOffset && BitOffset < EndOffset) { + BitOffset -= BeginOffset; V = VInner; } else { V = VOuter; @@ -11715,48 +12267,34 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, } break; } + assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"); + BroadcastIdx = BitOffset / NumEltBits; - // Ensure the source vector and BroadcastIdx are for a suitable type. - if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) { - unsigned NumEltBits = VT.getScalarSizeInBits(); - unsigned NumSrcBits = V.getScalarValueSizeInBits(); - if ((NumSrcBits % NumEltBits) == 0) - BroadcastIdx *= (NumSrcBits / NumEltBits); - else if ((NumEltBits % NumSrcBits) == 0 && - (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0) - BroadcastIdx /= (NumEltBits / NumSrcBits); - else - return SDValue(); - - unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; - MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts); - V = DAG.getBitcast(SrcVT, V); - } + // Do we need to bitcast the source to retrieve the original broadcast index? + bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits; // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. - // First, look through bitcast: if the original value has a larger element - // type than the shuffle, the broadcast element is in essence truncated. - // Make that explicit to ease folding. - if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) - if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( - DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + // If the original value has a larger element type than the shuffle, the + // broadcast element is in essence truncated. Make that explicit to ease + // folding. + if (BitCastSrc && VT.isInteger()) + if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast( + DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; - // Peek through any bitcast (only useful for loads). 
- SDValue BC = peekThroughBitcasts(V); - // Also check the simpler case, where we can directly reuse the scalar. - if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || - (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + if (!BitCastSrc && + ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { + } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -11767,10 +12305,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. - LoadSDNode *Ld = cast<LoadSDNode>(BC); + LoadSDNode *Ld = cast<LoadSDNode>(V); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( @@ -11779,7 +12318,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); - } else if (BroadcastIdx != 0) { + } else if (BitOffset != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. @@ -11791,18 +12330,15 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. - unsigned EltSize = VT.getScalarSizeInBits(); - if (((BroadcastIdx * EltSize) % 128) != 0) + if ((BitOffset % 128) != 0) return SDValue(); - // The shuffle input might have been a bitcast we looked through; look at - // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll - // later bitcast it to BroadcastVT. - assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && - "Unexpected vector element size"); + assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && + "Unexpected bit-offset"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); - V = extract128BitVector(V, BroadcastIdx, DAG, DL); + unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); + V = extract128BitVector(V, ExtractIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) @@ -11810,21 +12346,21 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. 
- MVT SrcVT = V.getSimpleValueType(); - if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { - assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { + assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); - if (SrcVT.isVector()) { - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); + MVT ExtVT; + if (V.getValueType().isVector()) { + unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; + ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { - SrcVT = BroadcastVT.getScalarType(); + ExtVT = BroadcastVT.getScalarType(); } - V = DAG.getBitcast(SrcVT, V); + V = DAG.getBitcast(ExtVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { + if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); @@ -11833,9 +12369,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. - if (SrcVT.getSizeInBits() > 128) { - MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), - 128 / SrcVT.getScalarSizeInBits()); + if (V.getValueSizeInBits() > 128) { + MVT ExtVT = V.getSimpleValueType().getScalarType(); + ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } @@ -11849,11 +12385,10 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. -static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, - unsigned &InsertPSMask, - const APInt &Zeroable, - ArrayRef<int> Mask, - SelectionDAG &DAG) { +static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, + unsigned &InsertPSMask, + const APInt &Zeroable, + ArrayRef<int> Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -11938,16 +12473,15 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, return false; } -static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, + ArrayRef<int> Mask, const APInt &Zeroable, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; - if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) + if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. 
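// Aside: the InsertPSMask matched above is the INSERTPS imm8: bits [7:6] pick
// the source lane of the second operand, bits [5:4] the destination lane, and
// bits [3:0] force result lanes to zero. A standalone encoder:
#include <cassert>
#include <cstdint>

static uint8_t insertPSImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZeroMask < 16 && "imm8 field overflow");
  return uint8_t((SrcLane << 6) | (DstLane << 4) | ZeroMask);
}
// e.g. insertPSImm(0, 3, 0b0001) copies lane 0 of the second source into lane 3
// of the first operand's value and zeroes lane 0 of the result.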
@@ -11964,7 +12498,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsPermuteAndUnpack( +static SDValue lowerShuffleAsPermuteAndUnpack( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && @@ -12079,19 +12613,18 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack( /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. -static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the @@ -12116,16 +12649,20 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; @@ -12141,13 +12678,12 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
- if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); @@ -12161,19 +12697,18 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. -static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -12193,20 +12728,24 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; @@ -12214,33 +12753,32 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. @@ -12252,36 +12790,14 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// Test whether this can be lowered with a single SHUFPS instruction. -/// -/// This is used to disable more specialized lowerings when the shufps lowering -/// will happen to be efficient. -static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { - // This routine only handles 128-bit shufps. - assert(Mask.size() == 4 && "Unsupported mask size!"); - assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); - assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); - assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); - assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); - - // To lower with a single SHUFPS we need to have the low half and high half - // each requiring a single input. - if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) - return false; - if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) - return false; - - return true; -} - /// Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. -static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; @@ -12366,11 +12882,10 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. 
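// Aside: SHUFPS can only take its two low result lanes from the first operand
// and its two high result lanes from the second, which is why the
// isSingleSHUFPSMask() check insists that each half of the mask reads from a
// single input. A standalone model of the instruction's semantics:
#include <array>

using V4F = std::array<float, 4>;

static V4F shufps(const V4F &A, const V4F &B, unsigned Imm) {
  V4F R;
  R[0] = A[(Imm >> 0) & 3]; // low half: selectors index into the first source
  R[1] = A[(Imm >> 2) & 3];
  R[2] = B[(Imm >> 4) & 3]; // high half: selectors index into the second source
  R[3] = B[(Imm >> 6) & 3];
  return R;
}
// e.g. shufps(A, A, 0x00) splats A[0]; shufps(A, B, 0xE4) yields {A[0],A[1],B[2],B[3]}.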
-static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -12379,8 +12894,8 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. @@ -12413,29 +12928,32 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. - if (SDValue V = - lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) - if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( - DL, MVT::v4f32, V1, V2, Mask, DAG)) + if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, + V2, Mask, DAG)) return BlendPerm; } @@ -12449,23 +12967,21 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. 
-static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); @@ -12473,16 +12989,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 @@ -12501,14 +13017,18 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + if (Subtarget.hasAVX2()) + if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) + return Extract; + // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; @@ -12516,29 +13036,28 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; } @@ -12549,12 +13068,12 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( - DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) return Unpack; } @@ -12585,7 +13104,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. -static SDValue lowerV8I16GeneralSingleInputVectorShuffle( +static SDValue lowerV8I16GeneralSingleInputShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); @@ -12617,11 +13136,9 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); - int NumLToL = - std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; - int NumLToH = - std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); @@ -12730,7 +13247,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. - int ADWord, BDWord; + int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; @@ -12825,8 +13342,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. 
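The switch to llvm::lower_bound above is a readability cleanup: because LoInputs and HiInputs are sorted and deduplicated first, the number of entries below 4 is simply the distance to the first element >= 4. A plain-STL sketch of the same bookkeeping (made-up name):

#include <algorithm>
#include <utility>
#include <vector>

// Returns {NumLToL, NumHToL}: of the mask values feeding the low half, how
// many come from the low half (0..3) vs. the high half (4..7) of the input.
static std::pair<int, int> countHalfInputsSketch(std::vector<int> LoInputs) {
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  int NumLToL = int(std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) -
                    LoInputs.begin());
  int NumHToL = int(LoInputs.size()) - NumLToL;
  return {NumLToL, NumHToL};
}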
- return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); @@ -13084,7 +13600,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. -static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( +static SDValue lowerShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && @@ -13147,54 +13663,51 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. -static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, - DAG, Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, + Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. 
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end()); - return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, - MutableMask, Subtarget, - DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask, + Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && @@ -13202,19 +13715,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, "shuffles."); // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; @@ -13222,50 +13735,54 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; if (SDValue BitBlend = - lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; + // Try to use byte shift instructions to mask. + if (SDValue V = lowerVectorShuffleAsByteShiftMask( + DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return V; + // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. 
if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; - return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG, V1InUse, V2InUse); + return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even @@ -13334,9 +13851,9 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, return 0; } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); @@ -13354,39 +13871,38 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. -static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use a zext lowering. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. 
if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); @@ -13394,12 +13910,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. @@ -13492,13 +14007,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; } - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; + + // Try to use byte shift instructions to mask. + if (SDValue V = lowerVectorShuffleAsByteShiftMask( + DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly @@ -13518,7 +14037,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, bool V1InUse = false; bool V2InUse = false; - SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( + SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, @@ -13526,8 +14045,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some @@ -13538,17 +14057,17 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. 
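The PSHUFB-based blend used for v16i8 above can be pictured as two byte-select masks plus an OR, relying on the PSHUFB rule that a control byte with the high bit set writes zero. A rough sketch of the mask construction (names are illustrative, not the LLVM helper):

#include <array>
#include <cstdint>

struct PshufbBlendMasks {
  std::array<uint8_t, 16> V1Bytes; // PSHUFB control bytes applied to V1
  std::array<uint8_t, 16> V2Bytes; // PSHUFB control bytes applied to V2
};

static PshufbBlendMasks buildPshufbBlendMasks(const std::array<int, 16> &Mask) {
  PshufbBlendMasks M{};
  for (int i = 0; i < 16; ++i) {
    int Elt = Mask[i];
    // 0x80 zeroes the destination byte, so each input contributes only the
    // lanes it actually supplies.
    M.V1Bytes[i] = (Elt >= 0 && Elt < 16) ? uint8_t(Elt) : 0x80;
    M.V2Bytes[i] = (Elt >= 16) ? uint8_t(Elt - 16) : 0x80;
  }
  return M;
}

The result is PSHUFB(V1, V1Bytes) | PSHUFB(V2, V2Bytes); when only one of the two masks selects anything the OR can be dropped, which is what the V1InUse/V2InUse flags track above.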
- if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute( + if (SDValue V = lowerShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } @@ -13558,13 +14077,12 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion( + if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; - if (SDValue BitBlend = - lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) - return BitBlend; + if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Blend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for @@ -13605,8 +14123,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, + Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together @@ -13661,24 +14179,24 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. -static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: - return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: - return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: - return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: - return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: - return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: - return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); @@ -13690,9 +14208,9 @@ static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. 
-static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); @@ -13816,11 +14334,10 @@ static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. -static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); @@ -13845,8 +14362,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, return true; }; if (DoBothBroadcast()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -13860,12 +14377,12 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, + DAG); } /// Lower a vector shuffle crossing multiple 128-bit lanes as @@ -13874,9 +14391,9 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// -/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes, +/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, /// we should investigate merging them. -static SDValue lowerVectorShuffleAsLanePermuteAndPermute( +static SDValue lowerShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); @@ -13940,11 +14457,9 @@ static SDValue lowerVectorShuffleAsLanePermuteAndPermute( /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. 
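The split-vs-blend choice above comes down to how many 128-bit lanes of each operand the mask actually touches. A stripped-down sketch of that LaneInputs bookkeeping (plain containers, made-up name):

#include <vector>

// True when every element drawn from V1 comes from a single 128-bit lane of
// V1, and likewise for V2; splitting then decomposes into cheap subvector
// extracts plus half-width shuffles.
static bool eachInputUsesOneLaneSketch(const std::vector<int> &Mask,
                                       int LaneSize) {
  int Size = int(Mask.size());
  int NumLanes = Size / LaneSize;
  std::vector<bool> LaneUsed[2] = {std::vector<bool>(NumLanes, false),
                                   std::vector<bool>(NumLanes, false)};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneUsed[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  auto CountUsed = [](const std::vector<bool> &L) {
    int N = 0;
    for (bool B : L)
      N += B;
    return N;
  };
  return CountUsed(LaneUsed[0]) <= 1 && CountUsed(LaneUsed[1]) <= 1;
}

For a 256-bit v8f32 shuffle, LaneSize is 4, so each operand has two lanes and the test mirrors the LaneInputs loop in lowerShuffleAsSplitOrBlend above.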
-static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue lowerShuffleAsLanePermuteAndBlend( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); @@ -13959,14 +14474,14 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && @@ -13990,11 +14505,11 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, } /// Handle lowering 2-lane 128-bit shuffles. -static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); @@ -14021,8 +14536,8 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask @@ -14093,9 +14608,7 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. -/// -/// FIXME: This should be generalized to 512-bit shuffles. 
-static SDValue lowerVectorShuffleByMerging128BitLanes( +static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); @@ -14104,12 +14617,10 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return SDValue(); int Size = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; int LaneSize = 128 / VT.getScalarSizeInBits(); - int NumLanes = Size / LaneSize; - assert(NumLanes == 2 && "Only handles 256-bit shuffles."); - SmallVector<int, 16> RepeatMask(LaneSize, -1); - int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } }; + SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. @@ -14120,7 +14631,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; - // Determine which of the 4 possible input lanes (2 from each source) + // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. @@ -14259,54 +14770,30 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } -/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. -/// This allows for fast cases such as subvector extraction/insertion -/// or shuffling smaller vector types which can lower more efficiently. -static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert((VT.is256BitVector() || VT.is512BitVector()) && - "Expected 256-bit or 512-bit vector"); - - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); - - bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); - bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); - if (!UndefLower && !UndefUpper) - return SDValue(); - - // Upper half is undef and lower half is whole upper subvector. - // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (UndefUpper && - isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(HalfNumElts, DL)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(0, DL)); - } - - // Lower half is undef and upper half is whole lower subvector. - // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - if (UndefLower && - isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { - SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, - DAG.getIntPtrConstant(HalfNumElts, DL)); - } +/// If the input shuffle mask results in a vector that is undefined in all upper +/// or lower half elements and that mask accesses only 2 halves of the +/// shuffle's operands, return true. A mask of half the width with mask indexes +/// adjusted to access the extracted halves of the original shuffle operands is +/// returned in HalfMask. 
HalfIdx1 and HalfIdx2 return whether the upper or +/// lower half of each input operand is accessed. +static bool +getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask, + int &HalfIdx1, int &HalfIdx2) { + assert((Mask.size() == HalfMask.size() * 2) && + "Expected input mask to be twice as long as output"); + + // Exactly one half of the result must be undef to allow narrowing. + bool UndefLower = isUndefLowerHalf(Mask); + bool UndefUpper = isUndefUpperHalf(Mask); + if (UndefLower == UndefUpper) + return false; - // If the shuffle only uses two of the four halves of the input operands, - // then extract them and perform the 'half' shuffle at half width. - // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> - int HalfIdx1 = -1, HalfIdx2 = -1; - SmallVector<int, 8> HalfMask(HalfNumElts); - unsigned Offset = UndefLower ? HalfNumElts : 0; + unsigned HalfNumElts = HalfMask.size(); + unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; + HalfIdx1 = -1; + HalfIdx2 = -1; for (unsigned i = 0; i != HalfNumElts; ++i) { - int M = Mask[i + Offset]; + int M = Mask[i + MaskIndexOffset]; if (M < 0) { HalfMask[i] = M; continue; @@ -14333,42 +14820,27 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, } // Too many half vectors referenced. - return SDValue(); + return false; } - assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); - // Only shuffle the halves of the inputs when useful. - int NumLowerHalves = - (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); - int NumUpperHalves = - (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); - - // uuuuXXXX - don't extract uppers just to insert again. - if (UndefLower && NumUpperHalves != 0) - return SDValue(); - - // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. - if (UndefUpper && NumUpperHalves == 2) - return SDValue(); + return true; +} - // AVX2 - XXXXuuuu - always extract lowers. - if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { - // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. - if (VT == MVT::v4f64 || VT == MVT::v4i64) - return SDValue(); - // AVX2 supports variable 32-bit element cross-lane shuffles. - if (VT == MVT::v8f32 || VT == MVT::v8i32) { - // XXXXuuuu - don't extract lowers and uppers. - if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) - return SDValue(); - } - } +/// Given the output values from getHalfShuffleMask(), create a half width +/// shuffle of extracted vectors followed by an insert back to full width. +static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, + ArrayRef<int> HalfMask, int HalfIdx1, + int HalfIdx2, bool UndefLower, + SelectionDAG &DAG) { + assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); + assert(V1.getValueType().isSimple() && "Expecting only simple types"); - // AVX512 - XXXXuuuu - always extract lowers. - if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) - return SDValue(); + MVT VT = V1.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); - auto GetHalfVector = [&](int HalfIdx) { + auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? 
V1 : V2); @@ -14377,13 +14849,126 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, DAG.getIntPtrConstant(HalfIdx, DL)); }; - SDValue Half1 = GetHalfVector(HalfIdx1); - SDValue Half2 = GetHalfVector(HalfIdx2); + // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset + SDValue Half1 = getHalfVector(HalfIdx1); + SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + unsigned Offset = UndefLower ? HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } +/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. +static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT.is256BitVector() || VT.is512BitVector()) && + "Expected 256-bit or 512-bit vector"); + + bool UndefLower = isUndefLowerHalf(Mask); + if (!UndefLower && !isUndefUpperHalf(Mask)) + return SDValue(); + + assert((!UndefLower || !isUndefUpperHalf(Mask)) && + "Completely undef shuffle mask should have been simplified already"); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + if (!UndefLower && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + int HalfIdx1, HalfIdx2; + SmallVector<int, 8> HalfMask(HalfNumElts); + if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2)) + return SDValue(); + + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + // Only shuffle the halves of the inputs when useful. + unsigned NumLowerHalves = + (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); + unsigned NumUpperHalves = + (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); + assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"); + + // Determine the larger pattern of undef/halves, then decide if it's worth + // splitting the shuffle based on subtarget capabilities and types. + unsigned EltWidth = VT.getVectorElementType().getSizeInBits(); + if (!UndefLower) { + // XXXXuuuu: no insert is needed. + // Always extract lowers when setting lower - these are all free subreg ops. + if (NumUpperHalves == 0) + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + + if (NumUpperHalves == 1) { + // AVX2 has efficient 32/64-bit element cross-lane shuffles. 
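The new getHalfShuffleMask/getShuffleHalfVectors pair above narrows a wide shuffle whose result is undef in one half. A simplified, self-contained sketch of the mask-narrowing step (vector-based and illustrative only): half indices 0/1 select the low/high half of V1 and 2/3 the low/high half of V2.

#include <vector>

static bool getHalfShuffleMaskSketch(const std::vector<int> &Mask,
                                     std::vector<int> &HalfMask,
                                     int &HalfIdx1, int &HalfIdx2) {
  int HalfNumElts = int(Mask.size()) / 2;
  HalfMask.assign(HalfNumElts, -1);
  HalfIdx1 = HalfIdx2 = -1;

  auto HalfIsUndef = [&](int Lo) {
    for (int i = 0; i != HalfNumElts; ++i)
      if (Mask[Lo + i] >= 0)
        return false;
    return true;
  };
  bool UndefLower = HalfIsUndef(0);
  if (UndefLower == HalfIsUndef(HalfNumElts))
    return false;                        // exactly one half must be undef

  int Offset = UndefLower ? HalfNumElts : 0;
  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0)
      continue;
    int HalfIdx = M / HalfNumElts;       // which of the four half-vectors
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx == HalfIdx1) {
      HalfIdx1 = HalfIdx;
      HalfMask[i] = HalfElt;             // first referenced half -> operand 0
    } else if (HalfIdx2 < 0 || HalfIdx == HalfIdx2) {
      HalfIdx2 = HalfIdx;
      HalfMask[i] = HalfElt + HalfNumElts; // second referenced half -> operand 1
    } else {
      return false;                      // too many half vectors referenced
    }
  }
  return true;
}

For instance, the v8i32 mask {8, 9, 2, 3, u, u, u, u} narrows to HalfMask {0, 1, 6, 7} with HalfIdx1 = 2 (low half of V2) and HalfIdx2 = 0 (low half of V1): one 128-bit shuffle of the extracted halves plus an insert back into the low half.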
+ if (Subtarget.hasAVX2()) { + // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. + if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && + !is128BitUnpackShuffleMask(HalfMask) && + (!isSingleSHUFPSMask(HalfMask) || + Subtarget.hasFastVariableShuffle())) + return SDValue(); + // If this is a unary shuffle (assume that the 2nd operand is + // canonicalized to undef), then we can use vpermpd. Otherwise, we + // are better off extracting the upper half of 1 operand and using a + // narrow shuffle. + if (EltWidth == 64 && V2.isUndef()) + return SDValue(); + } + // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. + if (Subtarget.hasAVX512() && VT.is512BitVector()) + return SDValue(); + // Extract + narrow shuffle is better than the wide alternative. + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + } + + // Don't extract both uppers, instead shuffle and then extract. + assert(NumUpperHalves == 2 && "Half vector count went wrong"); + return SDValue(); + } + + // UndefLower - uuuuXXXX: an insert to high half is required if we split this. + if (NumUpperHalves == 0) { + // AVX2 has efficient 64-bit element cross-lane shuffles. + // TODO: Refine to account for unary shuffle, splat, and other masks? + if (Subtarget.hasAVX2() && EltWidth == 64) + return SDValue(); + // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. + if (Subtarget.hasAVX512() && VT.is512BitVector()) + return SDValue(); + // Narrow shuffle + insert is better than the wide alternative. + return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, + UndefLower, DAG); + } + + // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert. + return SDValue(); +} + /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -14569,9 +15154,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( SubLaneMask); } -static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, - unsigned &ShuffleImm, - ArrayRef<int> Mask) { +static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, + unsigned &ShuffleImm, ArrayRef<int> Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && @@ -14606,14 +15190,14 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, return false; } -static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; - if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, @@ -14624,23 +15208,22 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. 
-static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( - DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2, + Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. @@ -14668,29 +15251,33 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( - DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) + if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2, + Mask, DAG, Subtarget)) return V; // Otherwise, fall back. - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, + Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = - lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; + // If we have one input in place, then we can permute the other input and + // blend the result. + if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -14703,52 +15290,51 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) - return Result; + return V; // If we have VLX support, we can use VEXPAND. 
if (Subtarget.hasVLX()) - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. -static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { @@ -14772,31 +15358,36 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; } // Try to use PALIGNR. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. 
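Several of the fallbacks above end in lowerShuffleAsDecomposedShuffleBlend. Conceptually that decomposition splits a two-input mask into one single-input permute per operand plus a per-element selector, roughly as in this sketch (made-up names; the real routine applies further canonicalizations):

#include <vector>

struct DecomposedShuffle {
  std::vector<int> V1Mask, V2Mask; // single-input permutes, -1 = don't care
  std::vector<bool> TakeFromV2;    // per element: use the permuted V2?
};

static DecomposedShuffle decomposeShuffleBlendSketch(const std::vector<int> &Mask) {
  int NumElts = int(Mask.size());
  DecomposedShuffle D{std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1),
                      std::vector<bool>(NumElts, false)};
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (M < NumElts) {
      D.V1Mask[i] = M;             // element sourced from V1
    } else {
      D.V2Mask[i] = M - NumElts;   // element sourced from V2
      D.TakeFromV2[i] = true;
    }
  }
  return D;
}

The final value is then blend(shuffle(V1, V1Mask), shuffle(V2, V2Mask), TakeFromV2), which is why these paths are preferred only once a cheap blend (SSE4.1 BLENDPS/PBLENDW or AVX2 VPBLENDD) is known to be available.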
- if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; + // If we have one input in place, then we can permute the other input and + // blend the result. + if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG); + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -14809,35 +15400,34 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // instruction so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. -static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more @@ -14858,13 +15448,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -14884,49 +15473,49 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. 
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) - if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. -static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -14935,8 +15524,8 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split @@ -14944,17 +15533,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // vpunpcklwd and vpunpckhwd instrs. 
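Both the v8f32 path above and the v8i32 path below lean on the "repeated 128-bit lane" test before reusing the 4-lane SHUFPS/PSHUFD lowerings with a single immediate. A plain-container sketch of that test (illustrative, not the LLVM helper):

#include <vector>

static bool isRepeatedLaneMaskSketch(const std::vector<int> &Mask, int LaneSize,
                                     std::vector<int> &RepeatedMask) {
  int Size = int(Mask.size());
  RepeatedMask.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // The element must come from the matching lane of V1 or V2.
    if ((M % Size) / LaneSize != i / LaneSize)
      return false;
    // Fold to a lane-relative index; values LaneSize..2*LaneSize-1 mean "V2".
    int LocalM = (M < Size) ? M % LaneSize : M % LaneSize + LaneSize;
    int &Slot = RepeatedMask[i % LaneSize];
    if (Slot < 0)
      Slot = LocalM;
    else if (Slot != LocalM)
      return false;                // lanes disagree, not a repeating pattern
  }
  return true;
}

For example, the v8i32 mask {0, 8, 1, 9, 4, 12, 5, 13} repeats the per-lane pattern {0, 4, 1, 5} (an unpack-low), so the cheaper 128-bit strategies can be reused lane by lane.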
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) - if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more @@ -14970,30 +15559,29 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; } // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15015,31 +15603,30 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); - SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, - CastV1, CastV2, DAG); + SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, + CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. 
/// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. -static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15048,37 +15635,36 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15091,12 +15677,12 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are no generalized cross-lane shuffle operations available on i16 // element types. 
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, - Mask, DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, + DAG, Subtarget); } SmallVector<int, 8> RepeatedMask; @@ -15104,44 +15690,43 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. - return lowerV8I16GeneralSingleInputVectorShuffle( + return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. -static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); @@ -15150,37 +15735,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. 
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -15192,36 +15776,36 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, + Subtarget); } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. - if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. - if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. - return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. 
@@ -15229,24 +15813,23 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = - lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we @@ -15260,12 +15843,12 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. - if (SDValue V = - lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) + if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) return V; - if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), @@ -15277,17 +15860,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, switch (VT.SimpleTy) { case MVT::v4f64: - return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: - return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: - return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: - return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: - return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: - return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); @@ -15295,12 +15878,10 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } /// Try to 
lower a vector shuffle as a 128-bit shuffles. -static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); @@ -15397,11 +15978,10 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, } /// Handle lowering of 8-lane 64-bit floating point shuffles. -static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -15428,37 +16008,33 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2, - Subtarget, DAG)) + if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1, + V2, Subtarget, DAG)) return Shuf128; - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = - lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, - V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. -static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15480,16 +16056,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. 
- if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) - return Unpck; + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. - return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); + return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the @@ -15501,19 +16076,18 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } // If we have AVX512F support, we can use VEXPAND. - if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; - return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. -static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); @@ -15539,47 +16113,44 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable, - V1, V2, Subtarget, DAG)) + if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1, + V2, Subtarget, DAG)) return Shuf128; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to use PALIGNR. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue Unpck = - lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. 
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, - V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. -static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -15587,7 +16158,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; @@ -15604,25 +16175,24 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask, + Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. @@ -15630,27 +16200,26 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); - SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, - CastV1, CastV2, DAG); + SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, + CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. 
- if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, - V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, + DAG, Subtarget)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; - return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. -static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); @@ -15659,23 +16228,22 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask, + Subtarget, DAG)) return Rotate; if (V2.isUndef()) { @@ -15684,28 +16252,27 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. - return lowerV8I16GeneralSingleInputVectorShuffle( + return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; - return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. 
-static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const APInt &Zeroable, - SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); @@ -15714,37 +16281,36 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. - if (SDValue V = - lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. - if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, - Subtarget)) + if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, + Subtarget)) return V; // Try to use shift instructions. - if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask, + Subtarget, DAG)) return Rotate; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( - DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, + Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) - return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. @@ -15752,12 +16318,19 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Blend; + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (!V2.isUndef()) + if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( + DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); + return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// High-level routine to lower various 512-bit x86 vector shuffles. 
@@ -15765,11 +16338,11 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. -static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); @@ -15779,18 +16352,18 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = - lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. - if (SDValue Broadcast = - lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for @@ -15799,17 +16372,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: - return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: - return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: - return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: - return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); @@ -15818,7 +16391,7 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified -// version of matchVectorShuffleAsShift. +// version of matchShuffleAsShift. 
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask, int MaskOffset, const APInt &Zeroable) { int Size = Mask.size(); @@ -15853,11 +16426,11 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask, // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. -static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); @@ -16046,15 +16619,14 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. - if (V2IsUndef) - for (int M : Mask) - if (M >= NumElements) { - SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); - for (int &M : NewMask) - if (M >= NumElements) - M = -1; - return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); - } + if (V2IsUndef && + any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); + } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; @@ -16092,8 +16664,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. - if (SDValue Broadcast = - lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() @@ -16131,26 +16703,21 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); - if (SDValue V = - lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) + if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; // For each vector width, delegate to a specialized lowering routine. 
if (VT.is128BitVector()) - return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) - return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) - return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) - return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, - DAG); + return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -16410,7 +16977,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - DAG.getConstant(IdxVal, dl, MVT::i32)); + DAG.getIntPtrConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); @@ -16536,10 +17103,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); - if (!isa<ConstantSDNode>(N2)) + + auto *N2C = dyn_cast<ConstantSDNode>(N2); + if (!N2C || N2C->getAPIntValue().uge(NumElts)) return SDValue(); - auto *N2C = cast<ConstantSDNode>(N2); - unsigned IdxVal = N2C->getZExtValue(); + uint64_t IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); @@ -16584,13 +17152,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(IdxIn128, dl, MVT::i32)); + DAG.getIntPtrConstant(IdxIn128, dl)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); + // This will be just movd/movq/movss/movsd. + if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && + (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || + EltVT == MVT::i64)) { + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + } + // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { @@ -16622,7 +17198,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); + bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -16672,7 +17248,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, // Insert the 128-bit vector. 
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - assert(OpVT.is128BitVector() && "Expected an SSE type!"); + assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && + "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) @@ -16798,35 +17375,9 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue -X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { - const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); - - // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the - // global base reg. - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); - - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); - - SDLoc DL(Op); - Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); - - // With PIC, the address is actually $g + Offset. - if (OpFlag) { - Result = - DAG.getNode(ISD::ADD, DL, PtrVT, - DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); - } - - // For symbols that require a load from a stub to get the address, emit the - // load. - if (isGlobalStubReference(OpFlag)) - Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(DAG.getMachineFunction())); - - return Result; +SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } SDValue @@ -16850,35 +17401,67 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, - const SDLoc &dl, int64_t Offset, - SelectionDAG &DAG) const { - // Create the TargetGlobalAddress node, folding in the constant - // offset if it is legal. - unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); +/// Creates target global address or external symbol nodes for calls or +/// other uses. +SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, + bool ForCall) const { + // Unpack the global address or external symbol. + const SDLoc &dl = SDLoc(Op); + const GlobalValue *GV = nullptr; + int64_t Offset = 0; + const char *ExternalSym = nullptr; + if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) { + GV = G->getGlobal(); + Offset = G->getOffset(); + } else { + const auto *ES = cast<ExternalSymbolSDNode>(Op); + ExternalSym = ES->getSymbol(); + } + + // Calculate some flags for address lowering. + const Module &Mod = *DAG.getMachineFunction().getFunction().getParent(); + unsigned char OpFlags; + if (ForCall) + OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod); + else + OpFlags = Subtarget.classifyGlobalReference(GV, Mod); + bool HasPICReg = isGlobalRelativeToPICBase(OpFlags); + bool NeedsLoad = isGlobalStubReference(OpFlags); + CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; - if (OpFlags == X86II::MO_NO_FLAG && - X86::isOffsetSuitableForCodeModel(Offset, M)) { - // A direct static reference to a global. - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); - Offset = 0; + + if (GV) { + // Create a target global address if this is a global. If possible, fold the + // offset into the global address reference. Otherwise, ADD it on later. 
+ int64_t GlobalOffset = 0; + if (OpFlags == X86II::MO_NO_FLAG && + X86::isOffsetSuitableForCodeModel(Offset, M)) { + std::swap(GlobalOffset, Offset); + } + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); } else { - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); + // If this is not a global address, this must be an external symbol. + Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags); } + // If this is a direct call, avoid the wrapper if we don't need to do any + // loads or adds. This allows SDAG ISel to match direct calls. + if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0) + return Result; + Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. - if (isGlobalRelativeToPICBase(OpFlags)) { + if (HasPICReg) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. - if (isGlobalStubReference(OpFlags)) + if (NeedsLoad) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); @@ -16893,9 +17476,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); - return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); + return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } static SDValue @@ -17121,9 +17702,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } - if (Subtarget.isTargetKnownWindowsMSVC() || - Subtarget.isTargetWindowsItanium() || - Subtarget.isTargetWindowsGNU()) { + if (Subtarget.isOSWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -17263,7 +17842,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, APInt APIntShiftAmt; if (isConstantSplat(Amt, APIntShiftAmt)) { - uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); + uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); } @@ -17276,7 +17855,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -17320,6 +17899,70 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, DAG.getIntPtrConstant(0, dl)); } +static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, + const X86Subtarget &Subtarget) { + switch (Opcode) { + case ISD::SINT_TO_FP: + // TODO: Handle wider types with AVX/AVX512. + if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) + return false; + // CVTDQ2PS or (V)CVTDQ2PD + return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); + + case ISD::UINT_TO_FP: + // TODO: Handle wider types and i64 elements. 
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) + return false; + // VCVTUDQ2PS or VCVTUDQ2PD + return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; + + default: + return false; + } +} + +/// Given a scalar cast operation that is extracted from a vector, try to +/// vectorize the cast op followed by extraction. This will avoid an expensive +/// round-trip between XMM and GPR. +static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: This could be enhanced to handle smaller integer types by peeking + // through an extend. + SDValue Extract = Cast.getOperand(0); + MVT DestVT = Cast.getSimpleValueType(); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Extract.getOperand(1))) + return SDValue(); + + // See if we have a 128-bit vector cast op for this type of cast. + SDValue VecOp = Extract.getOperand(0); + MVT FromVT = VecOp.getSimpleValueType(); + unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits(); + MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM); + MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM); + if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget)) + return SDValue(); + + // If we are extracting from a non-zero element, first shuffle the source + // vector to allow extracting from element zero. + SDLoc DL(Cast); + if (!isNullConstant(Extract.getOperand(1))) { + SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1); + Mask[0] = Extract.getConstantOperandVal(1); + VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask); + } + // If the source vector is wider than 128-bits, extract the low part. Do not + // create an unnecessarily wide vector cast op. + if (FromVT != Vec128VT) + VecOp = extract128BitVector(VecOp, 0, DAG, DL); + + // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0 + // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 + SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, + DAG.getIntPtrConstant(0, DL)); +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); @@ -17327,6 +17970,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, @@ -17380,23 +18026,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); - unsigned ByteSize = SrcVT.getSizeInBits()/8; + unsigned ByteSize = SrcVT.getSizeInBits() / 8; FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); - MachineMemOperand *MMO; + MachineMemOperand *LoadMMO; if (FI) { int SSFI = FI->getIndex(); - MMO = DAG.getMachineFunction().getMachineMemOperand( + LoadMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { - MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } - SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; - SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : - X86ISD::FILD, DL, - Tys, Ops, SrcVT, MMO); + SDValue FILDOps[] = {Chain, StackSlot}; + SDValue Result = + DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, + Tys, FILDOps, SrcVT, LoadMMO); if (useSSE) { Chain = Result.getValue(1); @@ -17406,20 +18052,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); - unsigned SSFISize = Op.getValueSizeInBits()/8; + unsigned SSFISize = Op.getValueSizeInBits() / 8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); - SDValue Ops[] = { - Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag - }; - MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; + MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); - Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, Op.getValueType(), MMO); + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, + Op.getValueType(), StoreMMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); @@ -17554,7 +18198,7 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. - SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); + SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); @@ -17689,6 +18333,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); @@ -17741,7 +18388,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); - SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); @@ -17777,16 +18424,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), -// just return an <SDValue(), SDValue()> pair. +// just return an SDValue(). // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 -// to i16, i32 or i64, and we lower it to a legal sequence. -// If lowered to the final integer result we return a <result, SDValue()> pair. -// Otherwise we lower it to a sequence ending with a FIST, return a -// <FIST, StackSlot> pair, and the caller is responsible for loading -// the final integer result from StackSlot. -std::pair<SDValue,SDValue> +// to i16, i32 or i64, and we lower it to a legal sequence and return the +// result. 
+SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { + bool IsSigned) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); @@ -17796,18 +18440,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. - return std::make_pair(SDValue(), SDValue()); + return SDValue(); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. - bool UnsignedFixup = !IsSigned && - DstTy == MVT::i64 && - (!Subtarget.is64Bit() || - !isScalarFPTypeInSSEReg(TheVT)); + bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; - if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { + if (!IsSigned && DstTy != MVT::i64) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); @@ -17818,30 +18459,13 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); - // These are really Legal. - if (DstTy == MVT::i32 && - isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) - return std::make_pair(SDValue(), SDValue()); - if (Subtarget.is64Bit() && - DstTy == MVT::i64 && - isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) - return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); - unsigned MemSize = DstTy.getSizeInBits()/8; + unsigned MemSize = DstTy.getStoreSize(); int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } - SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. @@ -17883,9 +18507,10 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); - Adjust = DAG.getSelect(DL, MVT::i32, Cmp, - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0x80000000, DL, MVT::i32)); + Adjust = DAG.getSelect(DL, MVT::i64, Cmp, + DAG.getConstant(0, DL, MVT::i64), + DAG.getConstant(APInt::getSignMask(64), + DL, MVT::i64)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), @@ -17893,81 +18518,52 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. 
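The unsigned-i64 fixup above (threshold compare, conditional subtract, then XOR of the sign bit into the converted result) amounts to the following scalar recipe. This is a sketch of the semantics, not code from the patch; the threshold constant is 2^63:

#include <cstdint>

uint64_t fptoui64_via_signed_convert(double X) {
  const double Thresh = 9223372036854775808.0;  // 2^63
  if (X < Thresh)
    return (uint64_t)(int64_t)X;                // signed conversion suffices
  int64_t Rebased = (int64_t)(X - Thresh);      // now within signed range
  return (uint64_t)Rebased ^ (1ull << 63);      // XOR the sign bit back in
}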
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); - Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(MF, SSFI)); - SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); - SDValue Ops[] = { - Chain, StackSlot, DAG.getValueType(TheVT) - }; - - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); + Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); + SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); + SDValue Ops[] = { Chain, StackSlot }; + + unsigned FLDSize = TheVT.getStoreSize(); + assert(FLDSize <= MemSize && "Stack slot not big enough"); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); - SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); - StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); - - if (UnsignedFixup) { - - // Insert the FIST, load its result as two i32's, - // and XOR the high i32 with Adjust. + // Build the FP_TO_INT*_IN_MEM + MachineMemOperand *MMO = MF.getMachineMemOperand( + MPI, MachineMemOperand::MOStore, MemSize, MemSize); + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, + DAG.getVTList(MVT::Other), + Ops, DstTy, MMO); - SDValue FistOps[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - FistOps, DstTy, MMO); + SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); - SDValue Low32 = - DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); - SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); + // If we need an unsigned fixup, XOR the result with adjust. + if (UnsignedFixup) + Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust); - SDValue High32 = - DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); - High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); - - if (Subtarget.is64Bit()) { - // Join High32 and Low32 into a 64-bit result. - // (High32 << 32) | Low32 - Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); - High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); - High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, - DAG.getConstant(32, DL, MVT::i8)); - SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); - return std::make_pair(Result, SDValue()); - } - - SDValue ResultOps[] = { Low32, High32 }; - - SDValue pair = IsReplace - ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) - : DAG.getMergeValues(ResultOps, DL); - return std::make_pair(pair, SDValue()); - } else { - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, DstTy, MMO); - return std::make_pair(FIST, StackSlot); - } + return Res; } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - MVT VT = Op->getSimpleValueType(0); - SDValue In = Op->getOperand(0); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); + unsigned Opc = Op.getOpcode(); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); + assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && + "Unexpected extension opcode"); assert(VT.getVectorNumElements() == VT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || @@ -17979,6 +18575,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); + unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); + // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) @@ -17986,8 +18584,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input. - return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In); + return DAG.getNode(ExtendInVecOpc, dl, VT, In); } if (Subtarget.hasInt256()) @@ -18009,11 +18606,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In); + SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); + + // Short-circuit if we can determine that each 128-bit half is the same value. + // Otherwise, this is difficult to match and optimize. + if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In)) + if (hasIdenticalHalvesShuffleMask(Shuf->getMask())) + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo); SDValue ZeroVec = DAG.getConstant(0, dl, InVT); SDValue Undef = DAG.getUNDEF(InVT); - bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; + bool NeedZero = Opc == ISD::ZERO_EXTEND; SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); OpHi = DAG.getBitcast(HalfVT, OpHi); @@ -18188,8 +18791,11 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). - Res = DAG.getBitcast(MVT::v4i64, Res); - Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); + // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. 
+ SmallVector<int, 64> Mask; + int Scale = 64 / OutVT.getScalarSizeInBits(); + scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask); + Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); @@ -18431,12 +19037,12 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + SDLoc dl(Op); if (VT.isVector()) { - SDValue Src = Op.getOperand(0); - SDLoc dl(Op); - - if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) { + if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; @@ -18456,7 +19062,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); - if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { + if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); @@ -18467,19 +19073,34 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { assert(!VT.isVector()); - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, - IsSigned, /*IsReplace=*/ false); - SDValue FIST = Vals.first, StackSlot = Vals.second; - // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) + bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); + + if (!IsSigned && Subtarget.hasAVX512()) { + // Conversions from f32/f64 should be legal. + if (UseSSEReg) + return Op; + + // Use default expansion. + if (VT == MVT::i64) + return SDValue(); + } + + // Promote i16 to i32 if we can use a SSE operation. + if (VT == MVT::i16 && UseSSEReg) { + assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + // If this is a SINT_TO_FP using SSEReg we're done. + if (UseSSEReg && IsSigned) return Op; - if (StackSlot.getNode()) - // Load the result. - return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); + // Fall back to X87. + if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned)) + return V; - // The node is the result. - return FIST; + llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { @@ -18500,7 +19121,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { /// implementation, and likely shuffle complexity of the alternate sequence. 
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize(); + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -18522,16 +19143,11 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, if (!IsFP && !Subtarget.hasSSSE3()) return Op; - // Defer forming the minimal horizontal op if the vector source has more than - // the 2 extract element uses that we're matching here. In that case, we might - // form a horizontal op that includes more than 1 add/sub op. + // Extract from a common vector. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || - !LHS.getOperand(0)->hasNUsesOfValue(2, 0)) - return Op; - - if (!isa<ConstantSDNode>(LHS.getOperand(1)) || + !isa<ConstantSDNode>(LHS.getOperand(1)) || !isa<ConstantSDNode>(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; @@ -18549,33 +19165,37 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); - if (LExtIndex == 1 && RExtIndex == 0 && + if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); - // TODO: This can be extended to handle other adjacent extract pairs. - if (LExtIndex != 0 || RExtIndex != 1) + if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); + unsigned NumLanes = BitWidth / 128; + unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit - // equivalent, so extract the 256/512-bit source op to 128-bit. - // This is free: ymm/zmm -> xmm. + // equivalent, so extract the 256/512-bit source op to 128-bit if we can. SDLoc DL(Op); - if (BitWidth == 256 || BitWidth == 512) - X = extract128BitVector(X, 0, DAG, DL); + if (BitWidth == 256 || BitWidth == 512) { + unsigned LaneIdx = LExtIndex / NumEltsPerLane; + X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); + LExtIndex %= NumEltsPerLane; + } // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 + // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, - DAG.getIntPtrConstant(0, DL)); + DAG.getIntPtrConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -18741,36 +19361,25 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } -// Check whether an OR'd tree is PTEST-able. 
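The lowerAddSubToHorizontalOp changes above extend the match from elements 0/1 to any adjacent pair within a 128-bit lane, e.g. add (extractelt X, 2), (extractelt X, 3) --> extractelt (hadd X, X), 1. A minimal SSE3 sketch of that instance (the function name is invented for illustration):

#include <immintrin.h>

// Sum of elements 2 and 3 of a v4f32, expressed the way the lowering now
// prefers: one HADDPS of X with itself, then read element 1 of the result.
float sum_elts_2_and_3(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X);                  // H[0]=X0+X1, H[1]=X2+X3
  return _mm_cvtss_f32(_mm_shuffle_ps(H, H, 1)); // move H[1] into lane 0
}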
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, - const X86Subtarget &Subtarget, - SelectionDAG &DAG, - SDValue &X86CC) { - assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); - - if (!Subtarget.hasSSE41()) - return SDValue(); - - if (!Op->hasOneUse()) - return SDValue(); - - SDNode *N = Op.getNode(); - SDLoc DL(N); - +/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) +/// style scalarized (associative) reduction patterns. +static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, + SmallVectorImpl<SDValue> &SrcOps) { SmallVector<SDValue, 8> Opnds; - DenseMap<SDValue, unsigned> VecInMap; - SmallVector<SDValue, 8> VecIns; + DenseMap<SDValue, APInt> SrcOpMap; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. - Opnds.push_back(N->getOperand(0)); - Opnds.push_back(N->getOperand(1)); + assert(Op.getOpcode() == unsigned(BinOp) && + "Unexpected bit reduction opcode"); + Opnds.push_back(Op.getOperand(0)); + Opnds.push_back(Op.getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; - // BFS traverse all OR'd operands. - if (I->getOpcode() == ISD::OR) { + // BFS traverse all BinOp operands. + if (I->getOpcode() == unsigned(BinOp)) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. @@ -18780,42 +19389,63 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); + return false; // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa<ConstantSDNode>(Idx)) - return SDValue(); + return false; - SDValue ExtractedFromVec = I->getOperand(0); - DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); - if (M == VecInMap.end()) { - VT = ExtractedFromVec.getValueType(); - // Quit if not 128/256-bit vector. - if (!VT.is128BitVector() && !VT.is256BitVector()) - return SDValue(); + SDValue Src = I->getOperand(0); + DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src); + if (M == SrcOpMap.end()) { + VT = Src.getValueType(); // Quit if not the same type. - if (VecInMap.begin() != VecInMap.end() && - VT != VecInMap.begin()->first.getValueType()) - return SDValue(); - M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; - VecIns.push_back(ExtractedFromVec); + if (SrcOpMap.begin() != SrcOpMap.end() && + VT != SrcOpMap.begin()->first.getValueType()) + return false; + unsigned NumElts = VT.getVectorNumElements(); + APInt EltCount = APInt::getNullValue(NumElts); + M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; + SrcOps.push_back(Src); } - M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); + // Quit if element already used. + unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (M->second[CIdx]) + return false; + M->second.setBit(CIdx); } - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Not extracted from 128-/256-bit vector."); + // Quit if not all elements are used. 
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(), + E = SrcOpMap.end(); + I != E; ++I) { + if (!I->second.isAllOnesValue()) + return false; + } - unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + return true; +} - for (DenseMap<SDValue, unsigned>::const_iterator - I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { - // Quit if not all elements are used. - if (I->second != FullMask) - return SDValue(); - } +// Check whether an OR'd tree is PTEST-able. +static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, SDValue &X86CC) { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget.hasSSE41() || !Op->hasOneUse()) + return SDValue(); + + SmallVector<SDValue, 8> VecIns; + if (!matchBitOpReduction(Op, ISD::OR, VecIns)) + return SDValue(); + + // Quit if not 128/256-bit vector. + EVT VT = VecIns[0].getValueType(); + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + SDLoc DL(Op); MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. @@ -18831,10 +19461,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, - DL, MVT::i8); - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, - VecIns.back(), VecIns.back()); + X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, + MVT::i8); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// return true if \c Op has a use that doesn't just read flags. @@ -18972,29 +19601,52 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG, Subtarget); - if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || - Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Only promote the compare up to I32 if it is a 16 bit operation - // with an immediate. 16 bit immediates are to be avoided. - if (Op0.getValueType() == MVT::i16 && - ((isa<ConstantSDNode>(Op0) && - !cast<ConstantSDNode>(Op0)->getAPIntValue().isSignedIntN(8)) || - (isa<ConstantSDNode>(Op1) && - !cast<ConstantSDNode>(Op1)->getAPIntValue().isSignedIntN(8))) && - !DAG.getMachineFunction().getFunction().optForMinSize() && - !Subtarget.isAtom()) { + EVT CmpVT = Op0.getValueType(); + + if (CmpVT.isFloatingPoint()) + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + + assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || + CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); + + // Only promote the compare up to I32 if it is a 16 bit operation + // with an immediate. 16 bit immediates are to be avoided. + if (CmpVT == MVT::i16 && !Subtarget.isAtom() && + !DAG.getMachineFunction().getFunction().hasMinSize()) { + ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0); + ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1); + // Don't do this if the immediate can fit in 8-bits. + if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || + (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); - Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); + if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { + // For equality comparisons try to use SIGN_EXTEND if the input was + // truncate from something with enough sign bits. + if (Op0.getOpcode() == ISD::TRUNCATE) { + SDValue In = Op0.getOperand(0); + unsigned EffBits = + In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; + if (EffBits <= 16) + ExtendOp = ISD::SIGN_EXTEND; + } else if (Op1.getOpcode() == ISD::TRUNCATE) { + SDValue In = Op1.getOperand(0); + unsigned EffBits = + In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; + if (EffBits <= 16) + ExtendOp = ISD::SIGN_EXTEND; + } + } + + CmpVT = MVT::i32; + Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); + Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } - // Use SUB instead of CMP to enable CSE between SUB and CMP. - SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); - SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return SDValue(Sub.getNode(), 1); } - assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!"); - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + // Use SUB instead of CMP to enable CSE between SUB and CMP. + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); + return Sub.getValue(1); } /// Convert a comparison if required by the subtarget. @@ -19155,7 +19807,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -19299,10 +19951,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } -/// Given a simple buildvector constant, return a new vector constant with each -/// element decremented. If decrementing would result in underflow or this -/// is not a simple vector constant, return an empty value. -static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { +/// Given a buildvector constant, return a new vector constant with each element +/// incremented or decremented. If incrementing or decrementing would result in +/// unsigned overflow or underflow or this is not a simple vector constant, +/// return an empty value. +static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) { auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode()); if (!BV) return SDValue(); @@ -19317,11 +19970,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) { if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) return SDValue(); - // Avoid underflow. - if (Elt->getAPIntValue().isNullValue()) + // Avoid overflow/underflow. + const APInt &EltC = Elt->getAPIntValue(); + if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue())) return SDValue(); - NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT)); + NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 
1 : -1), DL, EltVT)); } return DAG.getBuildVector(VT, DL, NewVecC); @@ -19353,12 +20007,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = decrementVectorConstant(Op1, DAG); + SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; break; } + case ISD::SETUGT: { + // If the comparison is against a constant, we can turn this into a setuge. + // This is beneficial because materializing a constant 0 for the PCMPEQ is + // probably cheaper than XOR+PCMPGT using 2 different vector constants: + // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 + SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); + if (!UGEOp1) + return SDValue(); + Op1 = Op0; + Op0 = UGEOp1; + break; + } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: std::swap(Op0, Op1); @@ -19455,10 +20121,6 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); - // Break 256-bit integer vector compare into smaller ones. - if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntVSETCC(Op, DAG); - // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, @@ -19512,6 +20174,27 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } } + // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2. + if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND && + Op0.getOperand(1) == Op1 && Op0.hasOneUse()) { + ConstantSDNode *C1 = isConstOrConstSplat(Op1); + if (C1 && C1->getAPIntValue().isPowerOf2()) { + unsigned BitWidth = VT.getScalarSizeInBits(); + unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1; + + SDValue Result = Op0.getOperand(0); + Result = DAG.getNode(ISD::SHL, dl, VT, Result, + DAG.getConstant(ShiftAmt, dl, VT)); + Result = DAG.getNode(ISD::SRA, dl, VT, Result, + DAG.getConstant(BitWidth - 1, dl, VT)); + return Result; + } + } + + // Break 256-bit integer vector compare into smaller ones. + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntVSETCC(Op, DAG); + // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. // which will be swapped to SETGT. @@ -19539,17 +20222,20 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. - // TODO: This could be extended to handle a non-splat constant by checking - // that each element of the constant is not the max/null value. 
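For the X > C --> X >= (C+1) --> X == umax(X, C+1) rewrite above, a minimal sketch with SSE2 byte lanes; the constant 42 is only an example, and the function name is not from the patch:

#include <immintrin.h>

// Per-lane unsigned "X > 42" without sign-bit-flip constants: X equals
// umax(X, 43) exactly when X >= 43.
__m128i ugt_42_epu8(__m128i X) {
  __m128i M = _mm_max_epu8(X, _mm_set1_epi8(43));  // umax(X, C+1)
  return _mm_cmpeq_epi8(M, X);                     // all-ones lane iff X > 42
}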
- APInt C; - if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) { + if (Cond == ISD::SETUGT && + ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { + return !C->getAPIntValue().isMaxValue(); + })) { // X > C --> X >= (C+1) --> X == umax(X, C+1) - Op1 = DAG.getConstant(C + 1, dl, VT); + Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETUGE; } - if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) { + if (Cond == ISD::SETULT && + ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { + return !C->getAPIntValue().isNullValue(); + })) { // X < C --> X <= (C-1) --> X == umin(X, C-1) - Op1 = DAG.getConstant(C - 1, dl, VT); + Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETULE; } bool Invert = false; @@ -19835,7 +20521,7 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { break; case ISD::UADDO: BaseOp = X86ISD::ADD; - Cond = X86::COND_B; + Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; @@ -19876,6 +20562,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); + assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!"); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } @@ -20045,10 +20732,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); + SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Zero = DAG.getConstant(0, DL, Op.getValueType()); - return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); + return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, @@ -20120,7 +20807,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; @@ -20129,7 +20815,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || - Opc == X86ISD::BT) { // FIXME + Cmp.getOpcode() == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } @@ -20202,8 +20888,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - // Promote i16 cmovs if it won't prevent folding a load. - if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) { + // Or finally, promote i8 cmovs if we have CMOV, + // or i16 cmovs if it won't prevent folding a load. + // FIXME: we should not limit promotion of i8 case to only when the CMOV is + // legal, but EmitLoweredSelect() can not deal with these extensions + // being inserted between two CMOV's. 
(in i16 case too TBN) + // https://bugs.llvm.org/show_bug.cgi?id=40974 + if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || + (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && + !MayFoldLoad(Op2))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; @@ -20462,6 +21155,76 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +/// Change a vector store into a pair of half-size vector stores. +static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert((StoredVal.getValueType().is256BitVector() || + StoredVal.getValueType().is512BitVector()) && + "Expecting 256/512-bit op"); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). + if (Store->isVolatile()) + return SDValue(); + + MVT StoreVT = StoredVal.getSimpleValueType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; + unsigned HalfAlign = (128 == HalfSize ? 16 : 32); + + SDLoc DL(Store); + SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); + SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); + SDValue Ptr0 = Store->getBasePtr(); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); + unsigned Alignment = Store->getAlignment(); + SDValue Ch0 = + DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), + Alignment, Store->getMemOperand()->getFlags()); + SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, + Store->getPointerInfo().getWithOffset(HalfAlign), + MinAlign(Alignment, HalfAlign), + Store->getMemOperand()->getFlags()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); +} + +/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar +/// type. +static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, + SelectionDAG &DAG) { + SDValue StoredVal = Store->getValue(); + assert(StoreVT.is128BitVector() && + StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); + StoredVal = DAG.getBitcast(StoreVT, StoredVal); + + // Splitting volatile memory ops is not allowed unless the operation was not + // legal to begin with. We are assuming the input op is legal (this transform + // is only used for targets with AVX). 
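The splitVectorStore helper above turns a 256-bit store of two concatenated 128-bit halves into two plain 128-bit stores, so no vinsertf128 is built just to feed the store. At the source level the effect is roughly the following (illustrative only, unaligned stores assumed):

#include <immintrin.h>

// Store the two halves directly; the stores carry no 256-bit concat and can
// issue independently.
void store_halves(float *P, __m128 Lo, __m128 Hi) {
  _mm_storeu_ps(P, Lo);      // bytes [0, 16)
  _mm_storeu_ps(P + 4, Hi);  // bytes [16, 32)
}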
+ if (Store->isVolatile()) + return SDValue(); + + MVT StoreSVT = StoreVT.getScalarType(); + unsigned NumElems = StoreVT.getVectorNumElements(); + unsigned ScalarSize = StoreSVT.getStoreSize(); + unsigned Alignment = Store->getAlignment(); + + SDLoc DL(Store); + SmallVector<SDValue, 4> Stores; + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Offset = i * ScalarSize; + SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL); + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, + DAG.getIntPtrConstant(i, DL)); + SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, + Store->getPointerInfo().getWithOffset(Offset), + MinAlign(Alignment, Offset), + Store->getMemOperand()->getFlags()); + Stores.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); +} + static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); @@ -20491,28 +21254,47 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, if (St->isTruncatingStore()) return SDValue(); + // If this is a 256-bit store of concatenated ops, we are better off splitting + // that store into two 128-bit stores. This avoids spurious use of 256-bit ops + // and each half can execute independently. Some cores would split the op into + // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); + if (StoreVT.is256BitVector()) { + SmallVector<SDValue, 4> CatOps; + if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) + return splitVectorStore(St, DAG); + return SDValue(); + } + assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != TargetLowering::TypeWidenVector) return SDValue(); - // Widen the vector, cast to a v2x64 type, extract the single 64-bit element - // and store it. MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); - MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; - MVT CastVT = MVT::getVectorVT(StVT, 2); - StoredVal = DAG.getBitcast(CastVT, StoredVal); - StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, - DAG.getIntPtrConstant(0, dl)); - return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); + if (Subtarget.hasSSE2()) { + // Widen the vector, cast to a v2x64 type, extract the single 64-bit element + // and store it. + MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; + MVT CastVT = MVT::getVectorVT(StVT, 2); + StoredVal = DAG.getBitcast(CastVT, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, + DAG.getIntPtrConstant(0, dl)); + + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); + } + assert(Subtarget.hasSSE1() && "Expected SSE"); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, + St->getMemOperand()); } // Lower vector extended loads using a shuffle. 
If SSSE3 is not available we @@ -20703,13 +21485,13 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { - SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } @@ -21249,42 +22031,41 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector<SDValue, 8> Elts; unsigned NumElts = SrcOp->getNumOperands(); - ConstantSDNode *ND; - switch(Opc) { + switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast<ConstantSDNode>(CurrentOp); + auto *ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast<ConstantSDNode>(CurrentOp); + auto *ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: - for (unsigned i=0; i!=NumElts; ++i) { + for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } - ND = cast<ConstantSDNode>(CurrentOp); + auto *ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } @@ -21452,7 +22233,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, DAG.getBitcast(MVT::v8i1, Mask), DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND || + Op.getOpcode() == X86ISD::FSETCCM_SAE || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); @@ -21526,11 +22307,31 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { - if (!isa<ConstantSDNode>(Rnd)) - return false; + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) + return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; + + return false; + }; + auto isRoundModeSAE = [](SDValue Rnd) { + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) + return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; + + return false; + }; + auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) { + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) { + RC = C->getZExtValue(); + if (RC & X86::STATIC_ROUNDING::NO_EXC) { + // Clear the NO_EXC bit and check remaining bits. 
+ RC ^= X86::STATIC_ROUNDING::NO_EXC; + return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT || + RC == X86::STATIC_ROUNDING::TO_NEG_INF || + RC == X86::STATIC_ROUNDING::TO_POS_INF || + RC == X86::STATIC_ROUNDING::TO_ZERO; + } + } - unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); - return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; + return false; }; SDLoc dl(Op); @@ -21546,13 +22347,29 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(2); - if (!isRoundModeCurDirection(Rnd)) { + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), - Op.getOperand(1), Rnd); - } + Op.getOperand(1), + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); } + case INTR_TYPE_1OP_SAE: { + SDValue Sae = Op.getOperand(2); + + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1)); + } case INTR_TYPE_2OP: { SDValue Src2 = Op.getOperand(2); @@ -21562,15 +22379,32 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(3); - if (!isRoundModeCurDirection(Rnd)) { + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), - Op.getOperand(1), Src2, Rnd); - } + Op.getOperand(1), Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Src2); } + case INTR_TYPE_2OP_SAE: { + SDValue Sae = Op.getOperand(3); + + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + } case INTR_TYPE_3OP: case INTR_TYPE_3OP_IMM8: { SDValue Src1 = Op.getOperand(1); @@ -21586,11 +22420,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Src3, Rnd); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), + Src1, Src2, Src3, + DAG.getTargetConstant(RC, dl, MVT::i32)); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), @@ -21599,44 +22435,45 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); - case INTR_TYPE_1OP_MASK_RM: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - SDValue RoundingMode; - // We always add rounding mode to the Node. 
- // If the rounding mode is not specified, we add the - // "current direction" mode. - if (Op.getNumOperands() == 4) - RoundingMode = - DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - else - RoundingMode = Op.getOperand(4); - assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, - RoundingMode), - Mask, PassThru, Subtarget, DAG); - } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when - // - RM Opcode is specified and - // - RM is not "current direction". + // - RC Opcode is specified and + // - RC is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return getVectorMaskingNode( + DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), + Src, DAG.getTargetConstant(RC, dl, MVT::i32)), + Mask, PassThru, Subtarget, DAG); + if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } + case INTR_TYPE_1OP_MASK_SAE: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Rnd = Op.getOperand(4); + + unsigned Opc; + if (isRoundModeCurDirection(Rnd)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Rnd)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); + } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -21650,10 +22487,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + return getScalarMaskingNode( + DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)), + Mask, passThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Rnd), - Mask, passThru, Subtarget, DAG); + return SDValue(); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), @@ -21663,123 +22504,138 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); + unsigned Opc = IntrData->Opc0; if (HasRounding) { SDValue Sae = Op.getOperand(6); - if (!isRoundModeCurDirection(Sae)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, - RoundingMode, Sae), - Mask, passThru, Subtarget, DAG); + if (isRoundModeSAE(Sae)) + Opc = IntrWithRoundingModeOpcode; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } - case INTR_TYPE_SCALAR_MASK_RM: { + case INTR_TYPE_SCALAR_MASK_RND: { SDValue 
Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); - SDValue Src0 = Op.getOperand(3); + SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // There are 2 kinds of intrinsics in this group: - // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands - // (2) With rounding mode and sae - 7 operands. - if (Op.getNumOperands() == 6) { - SDValue Sae = Op.getOperand(5); - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - Sae), - Mask, Src0, Subtarget, DAG); - } - assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); - SDValue RoundingMode = Op.getOperand(5); - SDValue Sae = Op.getOperand(6); - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - RoundingMode, Sae), - Mask, Src0, Subtarget, DAG); + SDValue Rnd = Op.getOperand(5); + + SDValue NewOp; + unsigned RC = 0; + if (isRoundModeCurDirection(Rnd)) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); + else if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else + return SDValue(); + + return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_SAE: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + SDValue Sae = Op.getOperand(5); + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); + + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - - // We specify 2 possible opcodes for intrinsics with rounding modes. - // First, we check if the intrinsic may have non-default rounding mode, - // (IntrData->Opc1 != 0), then we check the rounding mode operand. - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { + SDValue NewOp; + if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned RC = 0; + if (isRoundModeSAEToX(Rnd, RC)) + NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, + DAG.getTargetConstant(RC, dl, MVT::i32)); + else if (!isRoundModeCurDirection(Rnd)) + return SDValue(); } - // TODO: Intrinsics should have fast-math-flags to propagate. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), - Mask, PassThru, Subtarget, DAG); + if (!NewOp) + NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); + return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK_RM: { + case INTR_TYPE_2OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding - // modes. - // First, we check if the intrinsic have rounding mode (6 operands), - // if not, we set rounding mode to "current". 
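The isRoundModeSAE / isRoundModeSAEToX checks introduced earlier in this hunk distinguish "suppress exceptions only" from "embedded rounding plus suppressed exceptions": the latter is a concrete TO_* rounding constant OR'ed with NO_EXC. A sketch of what that looks like at the intrinsic level (AVX-512F assumed):

#include <immintrin.h>

__m512 add_round_toward_neg_inf(__m512 A, __m512 B) {
  // Embedded rounding: NO_EXC plus a real rounding mode, the combination
  // isRoundModeSAEToX accepts after clearing the NO_EXC bit.
  return _mm512_add_round_ps(A, B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}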
- SDValue Rnd; - if (Op.getNumOperands() == 6) - Rnd = Op.getOperand(5); - else - Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Rnd), + + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(5); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); + } + + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_SCALAR_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + unsigned Opc; + if (isRoundModeCurDirection(Sae)) + Opc = IntrData->Opc0; + else if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else + return SDValue(); - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(6); - if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, - Src2, Src3), + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - // We specify 2 possible opcodes for intrinsics with rounding modes. - // First, we check if the intrinsic may have non-default rounding mode, - // (IntrData->Opc1 != 0), then we check the rounding mode operand. - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(6); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src1, Src2, Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(6); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3), + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case BLENDV: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + + EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger(); + Src3 = DAG.getBitcast(MaskVT, Src3); + + // Reverse the operands to match VSELECT order. + return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); + } case VPERM_2OP : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -21792,35 +22648,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // first. return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case CVTPD2PS: - // ISD::FP_ROUND has a second argument that indicates if the truncation - // does not change the value. Set it to 0 since it can change. 
- return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), - DAG.getIntPtrConstant(0, dl)); - case CVTPD2PS_RND_MASK: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - // We add rounding mode to the Node when - // - RM Opcode is specified and - // - RM is not "current direction". - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) { - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), - Src, Rnd), - Mask, PassThru, Subtarget, DAG); - } - } - assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); - // ISD::FP_ROUND has a second argument that indicates if the truncation - // does not change the value. Set it to 0 since it can change. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, - DAG.getIntPtrConstant(0, dl)), - Mask, PassThru, Subtarget, DAG); - } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); @@ -21838,24 +22665,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); - SDValue Cmp; SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { - SDValue Rnd = Op.getOperand(4); - if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), CC, Rnd); + SDValue Sae = Op.getOperand(4); + if (isRoundModeSAE(Sae)) + return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC, Sae); + if (!isRoundModeCurDirection(Sae)) + return SDValue(); } //default rounding mode - if (!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); - - return Cmp; } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -21865,12 +22690,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Cmp; if (IntrData->Opc1 != 0) { - SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); + SDValue Sae = Op.getOperand(5); + if (isRoundModeSAE(Sae)) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae); + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); } //default rounding mode - if(!Cmp.getNode()) + if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), @@ -21930,9 +22757,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); - else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + else if (isRoundModeSAE(Sae)) + FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); + else + return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, @@ -21949,41 +22778,42 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnesConstant(Mask)) // return data as is + if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is return Op.getOperand(1); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - DataToCompress), - Mask, PassThru, Subtarget, DAG); + // Avoid false dependency. + if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, VT); + + return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru, + Mask); } - case FIXUPIMMS: - case FIXUPIMMS_MASKZ: case FIXUPIMM: - case FIXUPIMM_MASKZ:{ + case FIXUPIMM_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? - Src1 : getZeroVector(VT, Subtarget, DAG, dl); - // We specify 2 possible modes for intrinsics, with/without rounding - // modes. - // First, we check if the intrinsic have rounding mode (7 operands), - // if not, we set rounding mode to "current". - SDValue Rnd; - if (Op.getNumOperands() == 7) - Rnd = Op.getOperand(6); - else - Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); - if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3, Imm, Rnd), - Mask, Passthru, Subtarget, DAG); - else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Src3, Imm, Rnd), - Mask, Passthru, Subtarget, DAG); + SDValue Passthru = (IntrData->Type == FIXUPIMM) + ? Src1 + : getZeroVector(VT, Subtarget, DAG, dl); + + unsigned Opc = IntrData->Opc0; + if (IntrData->Opc1 != 0) { + SDValue Sae = Op.getOperand(6); + if (isRoundModeSAE(Sae)) + Opc = IntrData->Opc1; + else if (!isRoundModeCurDirection(Sae)) + return SDValue(); + } + + SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm); + + if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE) + return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); + + return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); @@ -22027,7 +22857,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: - case CVTPD2I_MASK: + case CVTPD2DQ_MASK: + case CVTQQ2PS_MASK: case TRUNCATE_TO_REG: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); @@ -22058,6 +22889,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, PassThru, Mask); } + case CVTNEPS2BF16_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); + + // Break false dependency. 
+ if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); + + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, + Mask); + } default: break; } @@ -22288,10 +23134,37 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); - else // This function handles the SP or FP case. - Reg = RegInfo->getPtrSizedFrameRegister(MF); + else { // Handles the SP or FP case. + bool CantUseFP = RegInfo->needsStackRealignment(MF); + if (CantUseFP) + Reg = RegInfo->getPtrSizedStackRegister(MF); + else + Reg = RegInfo->getPtrSizedFrameRegister(MF); + } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } + + case Intrinsic::x86_avx512_vp2intersect_q_512: + case Intrinsic::x86_avx512_vp2intersect_q_256: + case Intrinsic::x86_avx512_vp2intersect_q_128: + case Intrinsic::x86_avx512_vp2intersect_d_512: + case Intrinsic::x86_avx512_vp2intersect_d_256: + case Intrinsic::x86_avx512_vp2intersect_d_128: { + MVT MaskVT = Op.getSimpleValueType(); + + SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); + SDLoc DL(Op); + + SDValue Operation = + DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, + Op->getOperand(1), Op->getOperand(2)); + + SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, + MaskVT, Operation); + SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, + MaskVT, Operation); + return DAG.getMergeValues({Result0, Result1}, DL); + } } } @@ -22305,25 +23178,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = Mask.getValueType(); + EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
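A note on the "break false dependency" pattern used just above (compress/expand, CVTNEPS2BF16) and again for gather sources below: when the pass-through operand is undef, the lowering substitutes an all-zero value so the destination register is not left depending on whatever it last held. A per-lane sketch of the idea, outside the DAG APIs (illustrative only):

    // Sketch only: models "if (PassThru.isUndef()) PassThru = 0" per element.
    template <typename T>
    T maskedResultLane(bool LaneOn, T Computed, bool PassThruIsUndef, T PassThru) {
      T Fill = PassThruIsUndef ? T(0) : PassThru; // zero instead of a stale register
      return LaneOn ? Computed : Fill;
    }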
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } -static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain, - const X86Subtarget &Subtarget) { +static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast<ConstantSDNode>(ScaleOp); @@ -22341,17 +23215,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22364,8 +23239,6 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -22375,10 +23248,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - return SDValue(Res, 1); + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return 
Res.getValue(1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22401,24 +23277,37 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } -/// Handles the lowering of builtin intrinsic that return the value -/// of the extended control register. -static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SmallVectorImpl<SDValue> &Results) { - assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue LO, HI; +/// Handles the lowering of builtin intrinsics with chain that return their +/// value into registers EDX:EAX. +/// If operand ScrReg is a valid register identifier, then operand 2 of N is +/// copied to SrcReg. The assumption is that SrcReg is an implicit input to +/// TargetOpcode. +/// Returns a Glue value which can be used to add extra copy-from-reg if the +/// expanded intrinsics implicitly defines extra registers (i.e. not just +/// EDX:EAX). +static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + unsigned TargetOpcode, + unsigned SrcReg, + const X86Subtarget &Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Glue; - // The ECX register is used to select the index of the XCR register to - // return. - SDValue Chain = - DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); - SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); + if (SrcReg) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue); + Glue = Chain.getValue(1); + } + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue N1Ops[] = {Chain, Glue}; + SDNode *N1 = DAG.getMachineNode( + TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1)); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. + SDValue LO, HI; if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, @@ -22429,60 +23318,15 @@ static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, LO.getValue(2)); } Chain = HI.getValue(1); + Glue = HI.getValue(2); if (Subtarget.is64Bit()) { - // Merge the two 32-bit values into a 64-bit one.. - SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, DL, MVT::i8)); - Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); - Results.push_back(Chain); - return; - } - - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { LO, HI }; - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); - Results.push_back(Pair); - Results.push_back(Chain); -} - -/// Handles the lowering of builtin intrinsics that read performance monitor -/// counters (x86_rdpmc). -static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SmallVectorImpl<SDValue> &Results) { - assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue LO, HI; - - // The ECX register is used to select the index of the performance counter - // to read. 
- SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, - N->getOperand(2)); - SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); - - // Reads the content of a 64-bit performance counter and returns it in the - // registers EDX:EAX. - if (Subtarget.is64Bit()) { - LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, - LO.getValue(2)); - } else { - LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, - LO.getValue(2)); - } - Chain = HI.getValue(1); - - if (Subtarget.is64Bit()) { - // The EAX register is loaded with the low-order 32 bits. The EDX register - // is loaded with the supported high-order bits of the counter. + // Merge the two 32-bit values into a 64-bit one. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); - return; + return Glue; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. @@ -22490,6 +23334,7 @@ static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); + return Glue; } /// Handles the lowering of builtin intrinsics that read the time stamp counter @@ -22499,59 +23344,28 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl<SDValue> &Results) { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); - SDValue LO, HI; - // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. - if (Subtarget.is64Bit()) { - LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, - LO.getValue(2)); - } else { - LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); - HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, - LO.getValue(2)); - } - SDValue Chain = HI.getValue(1); - - SDValue TSC; - if (Subtarget.is64Bit()) { - // The EDX register is loaded with the high-order 32 bits of the MSR, and - // the EAX register is loaded with the low-order 32 bits. - TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, DL, MVT::i8)); - TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC); - } else { - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI }); - } - - if (Opcode == X86ISD::RDTSCP_DAG) { - assert(N->getNumOperands() == 2 && "Unexpected number of operands!"); - - // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into - // the ECX register. Add 'ecx' explicitly to the chain. 
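expandIntrinsicWChainHelper above centralizes what the removed XGETBV/RDPMC/RDTSC helpers each did by hand: the instruction leaves a 64-bit result split across EDX (high half) and EAX (low half), which is recombined with a shift and OR on 64-bit targets, or kept as a BUILD_PAIR on 32-bit targets. A plain worked example of the merge (values are placeholders, not DAG code):

    #include <cstdint>
    // lo is what EAX holds, hi is what EDX holds after e.g. RDTSC/RDPMC/XGETBV.
    uint64_t mergeEdxEax(uint32_t lo, uint32_t hi) {
      return (uint64_t(hi) << 32) | lo;   // the SHL + OR sequence emitted above
    }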
- SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, - HI.getValue(2)); - - Results.push_back(TSC); - Results.push_back(ecx); - Results.push_back(ecx.getValue(1)); + SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode, + /* NoRegister */0, Subtarget, + Results); + if (Opcode != X86::RDTSCP) return; - } - Results.push_back(TSC); - Results.push_back(Chain); + SDValue Chain = Results[1]; + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue); + Results[1] = ecx; + Results.push_back(ecx.getValue(1)); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector<SDValue, 3> Results; SDLoc DL(Op); - getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } @@ -22630,6 +23444,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); + case llvm::Intrinsic::x86_rdpkru: { + SDLoc dl(Op); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + // Create a RDPKRU node and pass 0 to the ECX parameter. + return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + } + case llvm::Intrinsic::x86_wrpkru: { + SDLoc dl(Op); + // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0 + // to the EDX and ECX parameters. + return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, + Op.getOperand(0), Op.getOperand(2), + DAG.getConstant(0, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + } case llvm::Intrinsic::x86_flags_read_u32: case llvm::Intrinsic::x86_flags_read_u64: case llvm::Intrinsic::x86_flags_write_u32: @@ -22639,7 +23469,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later - // during ExpandISelPseudos in EmitInstrWithCustomInserter. + // during FinalizeISel in EmitInstrWithCustomInserter. 
return SDValue(); } case Intrinsic::x86_lwpins32: @@ -22669,8 +23499,28 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); - SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); - return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, + Operation.getValue(1)); + } + case Intrinsic::x86_enqcmd: + case Intrinsic::x86_enqcmds: { + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic!"); + case Intrinsic::x86_enqcmd: + Opcode = X86ISD::ENQCMD; + break; + case Intrinsic::x86_enqcmds: + Opcode = X86ISD::ENQCMDS; + break; + } + SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2), + Op.getOperand(3)); + SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } } @@ -22716,7 +23566,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { @@ -22752,15 +23602,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. - case RDPMC: { - SmallVector<SDValue, 2> Results; - getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); - return DAG.getMergeValues(Results, dl); - } - // Get Extended Control Register. + case RDPMC: + // GetExtended Control Register. case XGETBV: { SmallVector<SDValue, 2> Results; - getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); + + // RDPMC uses ECX to select the index of the performance counter to read. + // XGETBV uses ECX to select the index of the XCR register to return. + // The result is stored into registers EDX:EAX. + expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX, + Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. @@ -22870,7 +23721,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( - SlotSize, /*Offset=*/0, /*IsImmutable=*/false); + SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); @@ -23453,10 +24304,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); - // Decompose 256-bit ops into smaller 128-bit ops. 
- if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); - assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); @@ -23548,22 +24395,48 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, return split256IntArith(Op, DAG); } -static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); if (VT.getScalarType() == MVT::i1) { SDLoc dl(Op); - switch (Op.getOpcode()) { + switch (Opcode) { default: llvm_unreachable("Expected saturated arithmetic opcode"); case ISD::UADDSAT: case ISD::SADDSAT: - return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1)); + // *addsat i1 X, Y --> X | Y + return DAG.getNode(ISD::OR, dl, VT, X, Y); case ISD::USUBSAT: case ISD::SSUBSAT: - return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), - DAG.getNOT(dl, Op.getOperand(1), VT)); + // *subsat i1 X, Y --> X & ~Y + return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT)); } } + if (VT.is128BitVector()) { + // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), VT); + SDLoc DL(Op); + if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) { + // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT); + return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add); + } + if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { + // usubsat X, Y --> (X >u Y) ? X - Y : 0 + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); + return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); + } + // Use default expansion. + return SDValue(); + } + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -23895,9 +24768,6 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Signed AVX2 implementation - extend xmm subvectors to ymm. if (VT == MVT::v32i8 && IsSigned) { - SDValue Lo = DAG.getIntPtrConstant(0, dl); - SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl); - MVT ExVT = MVT::v16i16; SDValue ALo = extract128BitVector(A, 0, DAG, dl); SDValue BLo = extract128BitVector(B, 0, DAG, dl); @@ -23907,8 +24777,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); - Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); - Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); @@ -24165,6 +25035,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, APInt APIntShiftAmt; if (!isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); + + // If the shift amount is out of range, return undef. 
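The LowerADDSAT_SUBSAT changes earlier in this hunk pick a compare-plus-select expansion when the unsigned min/max instructions are unavailable, and reduce the i1 forms to plain logic. A scalar model of those expansions (a sketch, not the vector code):

    #include <cstdint>
    // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
    uint8_t uaddsat8(uint8_t x, uint8_t y) {
      uint8_t sum = uint8_t(x + y);         // wraps on overflow
      return x > sum ? uint8_t(0xFF) : sum; // a wrapped result is smaller than x
    }
    // usubsat X, Y --> (X >u Y) ? X - Y : 0
    uint8_t usubsat8(uint8_t x, uint8_t y) {
      return x > y ? uint8_t(x - y) : uint8_t(0);
    }
    // i1 forms used above: *addsat i1 X, Y == X | Y;  *subsat i1 X, Y == X & ~Y.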
+ if (APIntShiftAmt.uge(VT.getScalarSizeInBits())) + return DAG.getUNDEF(VT); + uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -24206,8 +25081,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); + APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt); + return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -24233,54 +25108,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -// If V is a splat value, return the source vector and splat index; -static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) { - V = peekThroughEXTRACT_SUBVECTORs(V); - - EVT VT = V.getValueType(); - unsigned Opcode = V.getOpcode(); - switch (Opcode) { - default: { - APInt UndefElts; - APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); - if (DAG.isSplatValue(V, DemandedElts, UndefElts)) { - // Handle case where all demanded elements are UNDEF. - if (DemandedElts.isSubsetOf(UndefElts)) { - SplatIdx = 0; - return DAG.getUNDEF(VT); - } - SplatIdx = (UndefElts & DemandedElts).countTrailingOnes(); - return V; - } - break; - } - case ISD::VECTOR_SHUFFLE: { - // Check if this is a shuffle node doing a splat. - // TODO - remove this and rely purely on SelectionDAG::isSplatValue, - // getTargetVShiftNode currently struggles without the splat source. - auto *SVN = cast<ShuffleVectorSDNode>(V); - if (!SVN->isSplat()) - break; - int Idx = SVN->getSplatIndex(); - int NumElts = V.getValueType().getVectorNumElements(); - SplatIdx = Idx % NumElts; - return V.getOperand(Idx / NumElts); - } - } - - return SDValue(); -} - -static SDValue GetSplatValue(SDValue V, const SDLoc &dl, - SelectionDAG &DAG) { - int SplatIdx; - if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG)) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - SrcVector.getValueType().getScalarType(), SrcVector, - DAG.getIntPtrConstant(SplatIdx, dl)); - return SDValue(); -} - static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -24291,7 +25118,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); - if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) { + if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) { if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { MVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); @@ -25111,24 +25938,45 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - else if (OpWidth == 128) + return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit(); + if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); - else - return false; + + return false; } +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? +// TODO: In 32-bit mode, use FISTP when X87 is available? 
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
-  return needsCmpXchgNb(SI->getValueOperand()->getType());
+  Type *MemType = SI->getValueOperand()->getType();
+
+  bool NoImplicitFloatOps =
+      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+    return false;
+
+  return needsCmpXchgNb(MemType);
}

// Note: this turns large loads into lock cmpxchg8b/16b.
-// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
-  auto PTy = cast<PointerType>(LI->getPointerOperandType());
-  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
-                                               : AtomicExpansionKind::None;
+  Type *MemType = LI->getType();
+
+  // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+  // can use movq to do the load. If we have X87 we can load into an 80-bit
+  // X87 register and store it to a stack temporary.
+  bool NoImplicitFloatOps =
+      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+      (Subtarget.hasSSE2() || Subtarget.hasX87()))
+    return AtomicExpansionKind::None;
+
+  return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+                                 : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
@@ -25164,6 +26012,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FSub:
    // These always require a non-trivial set of data operations on x86. We must
    // use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
@@ -25180,13 +26030,20 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path.
+  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
-  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -25221,14 +26078,80 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
- LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, - AI->getType()->getPrimitiveSizeInBits()); + LoadInst *Loaded = + Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(), + AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } +/// Emit a locked operation on a stack location which does not change any +/// memory location, but does involve a lock prefix. Location is chosen to be +/// a) very likely accessed only by a single thread to minimize cache traffic, +/// and b) definitely dereferenceable. Returns the new Chain result. +static SDValue emitLockedStackOp(SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue Chain, SDLoc DL) { + // Implementation notes: + // 1) LOCK prefix creates a full read/write reordering barrier for memory + // operations issued by the current processor. As such, the location + // referenced is not relevant for the ordering properties of the instruction. + // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual, + // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions + // 2) Using an immediate operand appears to be the best encoding choice + // here since it doesn't require an extra register. + // 3) OR appears to be very slightly faster than ADD. (Though, the difference + // is small enough it might just be measurement noise.) + // 4) When choosing offsets, there are several contributing factors: + // a) If there's no redzone, we default to TOS. (We could allocate a cache + // line aligned stack object to improve this case.) + // b) To minimize our chances of introducing a false dependence, we prefer + // to offset the stack usage from TOS slightly. + // c) To minimize concerns about cross thread stack usage - in particular, + // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which + // captures state in the TOS frame and accesses it from many threads - + // we want to use an offset such that the offset is in a distinct cache + // line from the TOS frame. + // + // For a general discussion of the tradeoffs and benchmark results, see: + // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + + auto &MF = DAG.getMachineFunction(); + auto &TFL = *Subtarget.getFrameLowering(); + const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0; + + if (Subtarget.is64Bit()) { + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::RSP, MVT::i64), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i64), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment. + Zero, + Chain}; + SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, + MVT::Other, Ops); + return SDValue(Res, 1); + } + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment. 
+ Zero, + Chain + }; + SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, + MVT::Other, Ops); + return SDValue(Res, 1); +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -25244,19 +26167,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); - SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, dl, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(0, dl, MVT::i32), // Disp - DAG.getRegister(0, MVT::i32), // Segment. - Zero, - Chain - }; - SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops); - return SDValue(Res, 0); + SDValue Chain = Op.getOperand(0); + return emitLockedStackOp(DAG, Subtarget, Chain, dl); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. @@ -25297,10 +26209,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); - DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); - return SDValue(); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + cpOut, Success, EFLAGS.getValue(1)); } // Create MOVMSKB, taking into account whether we need to split for AVX1. @@ -25712,6 +26622,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, /// Lower atomic_load_ops into LOCK-prefixed operations. static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); SDValue Chain = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); @@ -25726,7 +26637,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to // select LXADD if LOCK_SUB can't be selected. if (Opc == ISD::ATOMIC_LOAD_SUB) { - AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, AN->getMemOperand()); @@ -25736,35 +26646,93 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, return N; } + // Specialized lowering for the canonical form of an idemptotent atomicrmw. + // The core idea here is that since the memory location isn't actually + // changing, all we need is a lowering for the *ordering* impacts of the + // atomicrmw. As such, we can chose a different operation and memory + // location to minimize impact on other code. + if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) { + // On X86, the only ordering which actually requires an instruction is + // seq_cst which isn't SingleThread, everything just needs to be preserved + // during codegen and then dropped. Note that we expect (but don't assume), + // that orderings other than seq_cst and acq_rel have been canonicalized to + // a store or load. + if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && + AN->getSyncScopeID() == SyncScope::System) { + // Prefer a locked operation against a stack location to minimize cache + // traffic. 
This assumes that stack locations are very likely to be + // accessed only by the owning thread. + SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); + assert(!N->hasAnyUseOfValue(0)); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), NewChain); + } + // MEMBARRIER is a compiler barrier; it codegens to a no-op. + SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain); + assert(!N->hasAnyUseOfValue(0)); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), NewChain); + } + SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); - DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); - return SDValue(); + // NOTE: The getUNDEF is needed to give something for the unused result 0. + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), + DAG.getUNDEF(VT), LockOp.getValue(1)); } -static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + auto *Node = cast<AtomicSDNode>(Op.getNode()); SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + EVT VT = Node->getMemoryVT(); + + bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent; + bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT); + + // If this store is not sequentially consistent and the type is legal + // we can just keep it. + if (!IsSeqCst && IsTypeLegal) + return Op; + + if (VT == MVT::i64 && !IsTypeLegal) { + // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. + // FIXME: Use movlps with SSE1. + // FIXME: Use fist with X87. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && + Subtarget.hasSSE2()) { + SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Node->getOperand(2)); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, + Ops, MVT::i64, + Node->getMemOperand()); + + // If this is a sequentially consistent store, also emit an appropriate + // barrier. + if (IsSeqCst) + Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); + + return Chain; + } + } // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) - // FIXME: On 32-bit, store -> fist or movq would be more efficient - // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. - if (cast<AtomicSDNode>(Node)->getOrdering() == - AtomicOrdering::SequentiallyConsistent || - !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, - cast<AtomicSDNode>(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2), - cast<AtomicSDNode>(Node)->getMemOperand()); - return Swap.getValue(1); - } - // Other atomic stores have a simple pattern. 
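The emitLockedStackOp helper introduced above is what both the non-MFENCE seq_cst fence and the idempotent seq_cst atomicrmw now lower to: a LOCK-prefixed OR of an immediate zero against a stack slot, which acts as a full barrier without modifying any live data. A rough user-level equivalent, assuming GCC/Clang extended inline assembly on x86 (illustration only, not what the backend emits verbatim):

    static inline void lockedStackBarrier() {
    #if defined(__x86_64__)
      // -64 mirrors the red-zone offset chosen above, keeping the RMW off the
      // hot top-of-stack cache line.
      __asm__ __volatile__("lock orq $0, -64(%%rsp)" ::: "memory", "cc");
    #elif defined(__i386__)
      __asm__ __volatile__("lock orl $0, (%%esp)" ::: "memory", "cc");
    #endif
    }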
- return Op; + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + Node->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + Node->getMemOperand()); + return Swap.getValue(1); } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { @@ -25928,7 +26896,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } return SDValue(); @@ -25944,7 +26911,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } // Custom widen all the operands to avoid promotion. @@ -25989,7 +26955,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } @@ -26000,8 +26965,28 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); + MVT MaskVT = Mask.getSimpleValueType(); + SDValue PassThru = N->getPassThru(); SDLoc dl(Op); + // Handle AVX masked loads which don't support passthru other than 0. + if (MaskVT.getVectorElementType() != MVT::i1) { + // We also allow undef in the isel pattern. + if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) + return Op; + + SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), + N->getBasePtr(), Mask, + getZeroVector(VT, Subtarget, DAG, dl), + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType(), + N->isExpandingLoad()); + // Emit a blend. + SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, + PassThru); + return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); + } + assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); @@ -26020,7 +27005,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); - SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG); + PassThru = ExtendToType(PassThru, WideDataVT, DAG); // Mask element has to be i1. 
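The LowerMLOAD change above covers AVX masked loads with a non-i1 mask, whose vmaskmov forms only produce zero in the masked-off lanes: a non-zero pass-through is obtained by loading with a zero pass-through and blending afterwards. A per-lane scalar model (a sketch, not the vector code):

    template <typename T>
    T maskedLoadLane(bool LaneOn, const T *Src, T PassThru) {
      T Loaded = LaneOn ? *Src : T(0);   // what the masked load itself yields
      return LaneOn ? Loaded : PassThru; // the extra VSELECT emitted above
    }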
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && @@ -26188,7 +27173,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); - case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); @@ -26281,7 +27266,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: - case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG); + case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: @@ -26310,12 +27295,19 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N, if (!Res.getNode()) return; - assert((N->getNumValues() <= Res->getNumValues()) && + // If the original node has one result, take the return value from + // LowerOperation as is. It might not be result number 0. + if (N->getNumValues() == 1) { + Results.push_back(Res); + return; + } + + // If the original node has multiple results, then the return node should + // have the same number of results. + assert((N->getNumValues() == Res->getNumValues()) && "Lowering returned the wrong number of results!"); // Places new result values base on N result number. - // In some cases (LowerSINT_TO_FP for example) Res has more result values - // than original node, chain should be dropped(last value). for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) Results.push_back(Res.getValue(I)); } @@ -26328,7 +27320,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: +#ifndef NDEBUG + dbgs() << "ReplaceNodeResults: "; + N->dump(&DAG); +#endif llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::CTPOP: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + // Use a v2i64 if possible. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) { + SDValue Wide = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0)); + Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide); + // Bit count should fit in 32-bits, extract it as that and then zero + // extend to i64. Otherwise we end up extracting bits 63:32 separately. 
+ Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); + Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, + DAG.getIntPtrConstant(0, dl)); + Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); + Results.push_back(Wide); + } + return; + } case ISD::MUL: { EVT VT = N->getValueType(0); assert(VT.isVector() && "Unexpected VT"); @@ -26394,6 +27410,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res); return; } + case ISD::ABS: { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + assert(N->getValueType(0) == MVT::i64 && + "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDValue Lo, Hi, Tmp; + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + Tmp = DAG.getNode( + ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(HalfT.getSizeInBits() - 1, dl, + TLI.getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + Results.push_back(Lo); + Results.push_back(Hi); + return; + } case ISD::SETCC: { // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when // setCC result type is v2i1 because type legalzation will end up with @@ -26566,14 +27607,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { - if (!ExperimentalVectorWideningLegalization) - return; - EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8)) { + (InVT == MVT::v4i16 || InVT == MVT::v4i8) && + getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { + assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting @@ -26598,16 +27638,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) { + if (VT == MVT::v16i32 || VT == MVT::v8i64) { + if (!InVT.is128BitVector()) { + // Not a 128 bit vector, but maybe type legalization will promote + // it to 128 bits. + if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger) + return; + InVT = getTypeToTransformTo(*DAG.getContext(), InVT); + if (!InVT.is128BitVector()) + return; + + // Promote the input to 128 bits. Type legalization will turn this into + // zext_inreg/sext_inreg. + In = DAG.getNode(N->getOpcode(), dl, InVT, In); + } + // Perform custom splitting instead of the two stage extend we would get // by default. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); assert(isTypeLegal(LoVT) && "Split VT not legal?"); - bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND; - - SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG); + SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG); // We need to shift the input over by half the number of elements. 
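Worked scalar model of the i64 ABS expansion added above (used when i64 is not a legal type, i.e. 32-bit targets): the sign of the high half is broadcast into a mask m, added through both halves with carry, and both halves are then XORed with m, i.e. abs(v) = (v + m) ^ m with m = v >> 63 (arithmetic shift). A sketch in plain C++:

    #include <cstdint>
    uint64_t abs64ViaHalves(int64_t v) {
      uint32_t lo = uint32_t(uint64_t(v));
      uint32_t hi = uint32_t(uint64_t(v) >> 32);
      uint32_t m  = (hi & 0x80000000u) ? 0xFFFFFFFFu : 0u; // SRA of the high half
      uint64_t lo64 = uint64_t(lo) + m;                     // UADDO: sum and carry out
      uint32_t carry = uint32_t(lo64 >> 32);
      uint32_t newLo = uint32_t(lo64) ^ m;                  // XOR after the add
      uint32_t newHi = (hi + m + carry) ^ m;                // ADDCARRY, then XOR
      return (uint64_t(newHi) << 32) | newLo;
    }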
unsigned NumElts = InVT.getVectorNumElements(); @@ -26617,7 +27669,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); - Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG); + Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); @@ -26744,17 +27796,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - std::pair<SDValue,SDValue> Vals = - FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); - SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode()) { - // Return a load from the stack slot. - if (StackSlot.getNode()) - Results.push_back( - DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); - else - Results.push_back(FIST); - } + if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned)) + Results.push_back(V); return; } case ISD::SINT_TO_FP: { @@ -26809,31 +27852,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: - return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: - return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: - return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); - + expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, + Results); + return; case Intrinsic::x86_xgetbv: - return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); + expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, + Results); + return; } } - case ISD::INTRINSIC_WO_CHAIN: { - if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) - Results.push_back(V); - return; - } case ISD::READCYCLECOUNTER: { - return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, - Results); + return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; + assert((!Regs64bit || Subtarget.hasCmpxchg16b()) && + "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), @@ -26912,6 +27954,66 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_LOAD: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { + auto *Node = cast<AtomicSDNode>(N); + if (Subtarget.hasSSE2()) { + // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the + // lower 64-bits. 
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Node->getMemOperand()); + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Ld.getValue(1)); + return; + } + if (Subtarget.hasX87()) { + // First load this into an 80-bit X87 register. This will put the whole + // integer into the significand. + // FIXME: Do we need to glue? See FIXME comment in BuildFILD. + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG, + dl, Tys, Ops, MVT::i64, + Node->getMemOperand()); + SDValue Chain = Result.getValue(1); + SDValue InFlag = Result.getValue(2); + + // Now store the X87 register to a stack temporary and convert to i64. + // This store is not atomic and doesn't need to be. + // FIXME: We don't need a stack temporary if the result of the load + // is already being stored. We could just directly store there. + SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag }; + Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, + DAG.getVTList(MVT::Other), StoreOps, + MVT::i64, MPI, 0 /*Align*/, + MachineMemOperand::MOStore); + + // Finally load the value back from the stack temporary and return it. + // This load is not atomic and doesn't need to be. + // This load will be further type legalized. + Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI); + Results.push_back(Result); + Results.push_back(Result.getValue(1)); + return; + } + } + // TODO: Use MOVLPS when SSE1 is available? + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by AtomicExpandPass.cpp. + break; + } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: @@ -26923,11 +28025,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; - } + case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); @@ -27070,19 +28171,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast<LoadSDNode>(N); - MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; - SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), - Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - SDValue Chain = Res.getValue(1); - MVT WideVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); - MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - Res = DAG.getBitcast(CastVT, Res); + if (Subtarget.hasSSE2()) { + MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? 
MVT::i64 : MVT::f64; + SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + SDValue Chain = Res.getValue(1); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + assert(Subtarget.hasSSE1() && "Expected SSE"); + SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); + SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; + SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Ld->getMemOperand()); Results.push_back(Res); - Results.push_back(Chain); + Results.push_back(Res.getValue(1)); return; } } @@ -27101,26 +28211,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; - case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; - case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; - case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FIST: return "X86ISD::FIST"; + case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; - case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; - case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; - case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; - case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; + case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; - case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; + case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -27149,12 +28255,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAXS: return "X86ISD::FMAXS"; - case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; - case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; + case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE"; + case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMINS: return "X86ISD::FMINS"; - case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; - case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; + case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE"; + case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -27186,6 +28292,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return 
"X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; @@ -27197,11 +28304,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; - case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; - case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; + case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; + case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; + case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; + case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; @@ -27274,11 +28383,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; + case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; - case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; + case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; - case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; + case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; @@ -27292,6 +28403,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::RDPKRU: return "X86ISD::RDPKRU"; + case X86ISD::WRPKRU: return "X86ISD::WRPKRU"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; @@ -27313,17 +28426,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; - case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; + case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; - case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; + case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; - case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; + case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; - case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; + case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; - case X86ISD::VGETMANT_RND: return 
"X86ISD::VGETMANT_RND"; + case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; - case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; + case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE"; case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -27334,26 +28447,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; + case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE"; case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; + case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FADDS: return "X86ISD::FADDS"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FSUBS: return "X86ISD::FSUBS"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FMULS: return "X86ISD::FMULS"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FDIVS: return "X86ISD::FDIVS"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FSQRTS: return "X86ISD::FSQRTS"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; - case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; - case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; + case X86ISD::FGETEXP: return "X86ISD::FGETEXP"; + case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE"; + case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS"; + case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; + case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; + case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; @@ -27362,23 +28489,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; - case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; - case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; + case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; + case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE"; case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; - case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; - case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; + case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE"; + case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return 
"X86ISD::CVTUI2P"; + case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; + case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; + case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; + case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; - case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; + case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; @@ -27389,6 +28520,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; + case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; + case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; + case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; + case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; @@ -27404,6 +28539,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; + case X86ISD::ENQCMD: return "X86ISD:ENQCMD"; + case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS"; + case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT"; } return nullptr; } @@ -27489,6 +28627,38 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { return true; } +bool X86TargetLowering::isBinOp(unsigned Opcode) const { + switch (Opcode) { + // These are non-commutative binops. + // TODO: Add more X86ISD opcodes once we have test coverage. + case X86ISD::ANDNP: + case X86ISD::PCMPGT: + case X86ISD::FMAX: + case X86ISD::FMIN: + case X86ISD::FANDN: + return true; + } + + return TargetLoweringBase::isBinOp(Opcode); +} + +bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { + switch (Opcode) { + // TODO: Add more X86ISD opcodes once we have test coverage. 
+ case X86ISD::PCMPEQ: + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: + case X86ISD::FMAXC: + case X86ISD::FMINC: + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: + return true; + } + + return TargetLoweringBase::isCommutativeBinOp(Opcode); +} + bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; @@ -27724,87 +28894,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, return sinkMBB; } -static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // insert input VAL into EAX - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI.getOperand(0).getReg()); - // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); - - // insert zero to EDX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); - - // insert WRPKRU instruction - BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - // insert zero to ECX - BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); - - // insert RDPKRU instruction - BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::EAX); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget, - unsigned Opc) { - DebugLoc dl = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - // Address into RAX/EAX, other two args into ECX, EDX. - unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI.getOperand(i)); - - unsigned ValOps = X86::AddrNumOperands; - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI.getOperand(ValOps).getReg()); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) - .addReg(MI.getOperand(ValOps + 1).getReg()); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(Opc)); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; -} - -static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, - const X86Subtarget &Subtarget) { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - // Address into RAX/EAX - unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; - unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI->getOperand(i)); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); - - MI->eraseFromParent(); // The pseudo is gone now. 
- return BB; -} - MachineBasicBlock * @@ -27834,10 +28923,18 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); + MachineFunction *MF = MBB->getParent(); + // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); - SmallVector<MachineMemOperand *, 1> MMOs(MI.memoperands_begin(), - MI.memoperands_end()); + + MachineMemOperand *OldMMO = MI.memoperands().front(); + + // Clone the MMO into two separate MMOs for loading and storing + MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand( + OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore); + MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand( + OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -27902,7 +28999,6 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -27935,7 +29031,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) @@ -27944,8 +29040,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise - BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) - .addMBB(overflowMBB); + BuildMI(thisMBB, DL, TII->get(X86::JCC_1)) + .addMBB(overflowMBB).addImm(X86::COND_AE); } // In offsetMBB, emit code to use the reg_save_area. @@ -27960,7 +29056,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, 16) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); @@ -27988,7 +29084,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) - .setMemRefs(MMOs); + .setMemRefs(StoreOnlyMMO); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) @@ -28007,7 +29103,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .add(Index) .addDisp(Disp, 8) .add(Segment) - .setMemRefs(MMOs); + .setMemRefs(LoadOnlyMMO); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. @@ -28044,7 +29140,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) - .setMemRefs(MMOs); + .setMemRefs(StoreOnlyMMO); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { @@ -28102,7 +29198,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. 
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); MBB->addSuccessor(EndMBB); } @@ -28382,13 +29478,11 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // Create the conditional branch instructions. X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); - unsigned Opc = X86::GetCondBranchFromCond(FirstCC); - BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); - unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); - BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); + BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] @@ -28474,20 +29568,21 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; - MachineBasicBlock::iterator NextMIIt = - std::next(MachineBasicBlock::iterator(MI)); + MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { - // See if we have a string of CMOVS with the same condition. + // See if we have a string of CMOVS with the same condition. Skip over + // intervening debug insts. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; + NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end()); } } @@ -28519,8 +29614,18 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, SinkMBB->addLiveIn(X86::EFLAGS); } + // Transfer any debug instructions inside the CMOV sequence to the sunk block. + auto DbgEnd = MachineBasicBlock::iterator(LastCMOV); + auto DbgIt = MachineBasicBlock::iterator(MI); + while (DbgIt != DbgEnd) { + auto Next = std::next(DbgIt); + if (DbgIt->isDebugInstr()) + SinkMBB->push_back(DbgIt->removeFromParent()); + DbgIt = Next; + } + // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. - SinkMBB->splice(SinkMBB->begin(), ThisMBB, + SinkMBB->splice(SinkMBB->end(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); @@ -28533,8 +29638,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. 
- unsigned Opc = X86::GetCondBranchFromCond(CC); - BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] @@ -28551,53 +29655,6 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, - MachineBasicBlock *BB) const { - // Combine the following atomic floating-point modification pattern: - // a.store(reg OP a.load(acquire), release) - // Transform them into: - // OPss (%gpr), %xmm - // movss %xmm, (%gpr) - // Or sd equivalent for 64-bit operations. - unsigned MOp, FOp; - switch (MI.getOpcode()) { - default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); - case X86::RELEASE_FADD32mr: - FOp = X86::ADDSSrm; - MOp = X86::MOVSSmr; - break; - case X86::RELEASE_FADD64mr: - FOp = X86::ADDSDrm; - MOp = X86::MOVSDmr; - break; - } - const X86InstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned ValOpIdx = X86::AddrNumOperands; - unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); - MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(FOp), - MRI.createVirtualRegister(MRI.getRegClass(VSrc))) - .addReg(VSrc); - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand &Operand = MI.getOperand(i); - // Clear any kill flags on register operands as we'll create a second - // instruction using the same address operands. - if (Operand.isReg()) - Operand.setIsKill(false); - MIB.add(Operand); - } - MachineInstr *FOpMI = MIB; - MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.add(MI.getOperand(i)); - MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); - MI.eraseFromParent(); // The pseudo instruction is gone now. - return BB; -} - -MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -28663,7 +29720,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); + BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. @@ -29290,7 +30347,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); - BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); + BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); @@ -29320,7 +30377,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. - BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB); + BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); @@ -29343,7 +30400,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addImm(8); // Jump if the result of the shift is zero. 
- BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); + BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); @@ -29378,7 +30435,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. - BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB); + BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); @@ -29523,10 +30580,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); - MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - int FI = MFI.getFunctionContextIndex(); + int FI = MF->getFrameInfo().getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. @@ -29624,7 +30680,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); - BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); + BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); @@ -29777,7 +30833,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: + case X86::CMOV_FR32X: case X86::CMOV_FR64: + case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -29832,10 +30890,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } - case X86::RELEASE_FADD32mr: - case X86::RELEASE_FADD64mr: - return EmitLoweredAtomicFP(MI, BB); - case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: @@ -29847,27 +30901,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FNSTCW16m)), CWFrameIdx); + TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); - // Load the old value of the high byte of the control word... + // Load the old value of the control word... unsigned OldCW = + MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), + OrigCWFrameIdx); + + // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. + unsigned NewCW = + MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) + .addReg(OldCW, RegState::Kill).addImm(0xC00); + + // Extract to 16 bits. 
+ unsigned NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), - CWFrameIdx); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) + .addReg(NewCW, RegState::Kill, X86::sub_16bit); - // Set the high part to be round to zero... - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) - .addImm(0xC7F); + // Prepare memory for FLDCW. + int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), + NewCWFrameIdx) + .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), CWFrameIdx); - - // Restore the memory image of control word to original value - addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) - .addReg(OldCW); + TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Get the X86 opcode to use. unsigned Opc; @@ -29890,26 +30954,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, - TII->get(X86::FLDCW16m)), CWFrameIdx); + TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } - // Thread synchronization. - case X86::MONITOR: - return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); - case X86::MONITORX: - return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); - - // Cache line zero - case X86::CLZERO: - return emitClzero(&MI, BB, Subtarget); - - // PKU feature - case X86::WRPKRU: - return emitWRPKRU(MI, BB, Subtarget); - case X86::RDPKRU: - return emitRDPKRU(MI, BB, Subtarget); + // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); @@ -30104,7 +31154,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); - Known = Known.zextOrTrunc(BitWidth); + Known = Known.zextOrTrunc(BitWidth, false); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } @@ -30161,6 +31211,27 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.trunc(BitWidth); break; } + case X86ISD::ANDNP: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // ANDNP = (~X & Y); + Known.One &= Known2.Zero; + Known.Zero |= Known2.One; + break; + } + case X86ISD::FOR: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // Output known-0 bits are only known if clear in both the LHS & RHS. + Known.Zero &= Known2.Zero; + // Output known-1 are known to be set if set in either the LHS | RHS. + Known.One |= Known2.One; + break; + } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); // If we don't know any bits, early out. 
@@ -30230,7 +31301,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - unsigned VTBits = Op.getScalarValueSizeInBits(); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getScalarSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SETCC_CARRY: @@ -30268,7 +31340,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::VSHLI: { SDValue Src = Op.getOperand(0); - APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + const APInt &ShiftVal = Op.getConstantOperandAPInt(1); if (ShiftVal.uge(VTBits)) return VTBits; // Shifted all bits out --> zero. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); @@ -30279,7 +31351,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); - APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + APInt ShiftVal = Op.getConstantOperandAPInt(1); if (ShiftVal.uge(VTBits - 1)) return VTBits; // Sign splat. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); @@ -30295,6 +31367,15 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( // Vector compares return zero/all-bits result values. return VTBits; + case X86ISD::ANDNP: { + unsigned Tmp0 = + DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Tmp0 == 1) return 1; // Early out. + unsigned Tmp1 = + DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); + return std::min(Tmp0, Tmp1); + } + case X86ISD::CMOV: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp0 == 1) return 1; // Early out. @@ -30303,6 +31384,54 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( } } + // Handle target shuffles. + // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. + if (isTargetShuffle(Opcode)) { + bool IsUnary; + SmallVector<int, 64> Mask; + SmallVector<SDValue, 2> Ops; + if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask, + IsUnary)) { + unsigned NumOps = Ops.size(); + unsigned NumElts = VT.getVectorNumElements(); + if (Mask.size() == NumElts) { + SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0)); + for (unsigned i = 0; i != NumElts; ++i) { + if (!DemandedElts[i]) + continue; + int M = Mask[i]; + if (M == SM_SentinelUndef) { + // For UNDEF elements, we don't know anything about the common state + // of the shuffle result. + return 1; + } else if (M == SM_SentinelZero) { + // Zero = all sign bits. + continue; + } + assert(0 <= M && (unsigned)M < (NumOps * NumElts) && + "Shuffle index out of range"); + + unsigned OpIdx = (unsigned)M / NumElts; + unsigned EltIdx = (unsigned)M % NumElts; + if (Ops[OpIdx].getValueType() != VT) { + // TODO - handle target shuffle ops with different value types. + return 1; + } + DemandedOps[OpIdx].setBit(EltIdx); + } + unsigned Tmp0 = VTBits; + for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) { + if (!DemandedOps[i]) + continue; + unsigned Tmp1 = + DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1); + Tmp0 = std::min(Tmp0, Tmp1); + } + return Tmp0; + } + } + } + // Fallback case. return 1; } @@ -30316,12 +31445,11 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const { // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. 
// TODO: Investigate sharing more of this with shuffle lowering. -static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { +static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget, unsigned &Shuffle, + MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); @@ -30333,19 +31461,25 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, return true; } - // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. + // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { - bool Match = true; + bool MatchAny = true; + bool MatchZero = true; unsigned NumDstElts = NumMaskElts / Scale; - for (unsigned i = 0; i != NumDstElts && Match; ++i) { - Match &= isUndefOrEqual(Mask[i * Scale], (int)i); - Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); + for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) { + if (!isUndefOrEqual(Mask[i * Scale], (int)i)) { + MatchAny = MatchZero = false; + break; + } + MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1); + MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } - if (Match) { + if (MatchAny || MatchZero) { + assert(MatchZero && "Failed to match zext but matched aext?"); unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); @@ -30354,10 +31488,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); - if (SrcVT.getVectorNumElements() == NumDstElts) - Shuffle = unsigned(ISD::ZERO_EXTEND); - else - Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); + Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND); + if (SrcVT.getVectorNumElements() != NumDstElts) + Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); @@ -30379,7 +31512,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { + if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; @@ -30437,29 +31570,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - // Attempt to match against broadcast-from-vector. 
- if (Subtarget.hasAVX2()) { - SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); - if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { - SrcVT = DstVT = MaskVT; - Shuffle = X86ISD::VBROADCAST; - return true; - } - } - return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - const APInt &Zeroable, - bool AllowFloatDomain, - bool AllowIntDomain, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &ShuffleVT, - unsigned &PermuteImm) { +static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, + const APInt &Zeroable, + bool AllowFloatDomain, bool AllowIntDomain, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; @@ -30560,9 +31682,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskScalarSizeInBits, Mask, - 0, Zeroable, Subtarget); + int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, + Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; @@ -30575,13 +31696,12 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, SDValue &V2, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, - bool IsUnary) { +static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDValue &V2, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, + bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -30642,7 +31762,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, return false; } -static bool matchBinaryPermuteVectorShuffle( +static bool matchBinaryPermuteShuffle( MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, @@ -30653,7 +31773,7 @@ static bool matchBinaryPermuteVectorShuffle( // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); + int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); @@ -30689,34 +31809,11 @@ static bool matchBinaryPermuteVectorShuffle( return true; } } else { - // Determine a type compatible with X86ISD::BLENDI. 
- ShuffleVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v8i32; - else if (ShuffleVT == MVT::v2i64) - ShuffleVT = MVT::v4i32; - } else { - if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) - ShuffleVT = MVT::v8i16; - else if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v4f64; - else if (ShuffleVT == MVT::v8i32) - ShuffleVT = MVT::v8f32; - } - - if (!ShuffleVT.isFloatingPoint()) { - int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); - BlendMask = - scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); - ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); - ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); - } - V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; + ShuffleVT = MaskVT; return true; } } @@ -30726,7 +31823,7 @@ static bool matchBinaryPermuteVectorShuffle( if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && - matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; @@ -30738,7 +31835,7 @@ static bool matchBinaryPermuteVectorShuffle( ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; @@ -30795,6 +31892,11 @@ static bool matchBinaryPermuteVectorShuffle( return false; } +static SDValue combineX86ShuffleChainWithExtract( + ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, + bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, + const X86Subtarget &Subtarget); + /// Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// @@ -30852,6 +31954,24 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + // Attempt to match a subvector broadcast. + // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) + if (UnaryShuffle && + (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) { + SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0); + if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) { + SDValue Src = Inputs[0]; + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(0).isUndef() && + Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits && + MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) { + return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL, + Src.getValueType(), + Src.getOperand(1))); + } + } + } + // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. @@ -30905,6 +32025,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. + // TODO: Should we indicate which domain is preferred if both are allowed? 
bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); @@ -30920,8 +32041,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && - (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { - unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; + (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() % + MaskEltSizeInBits) == 0) { + unsigned Scale = + cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() / + MaskEltSizeInBits; ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { @@ -30929,10 +32053,35 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } + // Attempt to match against broadcast-from-vector. + // Limit AVX1 to cases where we're loading+broadcasting a scalar element. + if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) + && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { + SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); + if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { + if (V1.getValueType() == MaskVT && + V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + MayFoldLoad(V1.getOperand(0))) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + return SDValue(); // Nothing to do! + Res = V1.getOperand(0); + Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + if (Subtarget.hasAVX2()) { + if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + return SDValue(); // Nothing to do! + Res = DAG.getBitcast(MaskVT, V1); + Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + } + } + SDValue NewV1 = V1; // Save operand in case early exit happens. - if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - NewV1, DL, DAG, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT) && + if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, + DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30941,9 +32090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, - AllowIntDomain, Subtarget, Shuffle, - ShuffleVT, PermuteImm) && + if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, Subtarget, Shuffle, ShuffleVT, + PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30956,9 +32105,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, SDValue NewV1 = V1; // Save operands in case early exit happens. 
SDValue NewV2 = V2; - if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - NewV1, NewV2, DL, DAG, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT, UnaryShuffle) && + if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, + NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! @@ -30970,7 +32119,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, NewV1 = V1; // Save operands in case early exit happens. NewV2 = V2; - if (matchBinaryPermuteVectorShuffle( + if (matchBinaryPermuteShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { @@ -30990,8 +32139,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; - if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, - Zeroable)) { + if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, + Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); @@ -31001,7 +32150,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } - if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { + if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); @@ -31068,6 +32217,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } + // If that failed and either input is extracted then try to combine as a + // shuffle with the larger type. + if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( + Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, + DAG, Subtarget)) + return WideShuffle; + // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && @@ -31233,10 +32389,145 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, Res); } + // If that failed and either input is extracted then try to combine as a + // shuffle with the larger type. + if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( + Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, + DAG, Subtarget)) + return WideShuffle; + + // If we have a dual input shuffle then lower to VPERMV3. 
+ if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros && + ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasVLX() && + (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 || + MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && + (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && + (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) { + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); + V1 = DAG.getBitcast(MaskVT, V1); + V2 = DAG.getBitcast(MaskVT, V2); + Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); + return DAG.getBitcast(RootVT, Res); + } + // Failed to find any combines. return SDValue(); } +// Combine an arbitrary chain of shuffles + extract_subvectors into a single +// instruction if possible. +// +// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger +// type size to attempt to combine: +// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1) +// --> +// extract_subvector(shuffle(x,y,m2),0) +static SDValue combineX86ShuffleChainWithExtract( + ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, + bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + unsigned NumMaskElts = BaseMask.size(); + unsigned NumInputs = Inputs.size(); + if (NumInputs == 0) + return SDValue(); + + SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end()); + SmallVector<unsigned, 4> Offsets(NumInputs, 0); + + // Peek through subvectors. + // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs? + unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits(); + for (unsigned i = 0; i != NumInputs; ++i) { + SDValue &Src = WideInputs[i]; + unsigned &Offset = Offsets[i]; + Src = peekThroughBitcasts(Src); + EVT BaseVT = Src.getValueType(); + while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR && + isa<ConstantSDNode>(Src.getOperand(1))) { + Offset += Src.getConstantOperandVal(1); + Src = Src.getOperand(0); + } + WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits()); + assert((Offset % BaseVT.getVectorNumElements()) == 0 && + "Unexpected subvector extraction"); + Offset /= BaseVT.getVectorNumElements(); + Offset *= NumMaskElts; + } + + // Bail if we're always extracting from the lowest subvectors, + // combineX86ShuffleChain should match this for the current width. + if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; })) + return SDValue(); + + EVT RootVT = Root.getValueType(); + unsigned RootSizeInBits = RootVT.getSizeInBits(); + unsigned Scale = WideSizeInBits / RootSizeInBits; + assert((WideSizeInBits % RootSizeInBits) == 0 && + "Unexpected subvector extraction"); + + // If the src vector types aren't the same, see if we can extend + // them to match each other. + // TODO: Support different scalar types? 
+ EVT WideSVT = WideInputs[0].getValueType().getScalarType(); + if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) { + return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) || + Op.getValueType().getScalarType() != WideSVT; + })) + return SDValue(); + + for (SDValue &NewInput : WideInputs) { + assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && + "Shuffle vector size mismatch"); + if (WideSizeInBits > NewInput.getValueSizeInBits()) + NewInput = widenSubVector(NewInput, false, Subtarget, DAG, + SDLoc(NewInput), WideSizeInBits); + assert(WideSizeInBits == NewInput.getValueSizeInBits() && + "Unexpected subvector extraction"); + } + + // Create new mask for larger type. + for (unsigned i = 1; i != NumInputs; ++i) + Offsets[i] += i * Scale * NumMaskElts; + + SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end()); + for (int &M : WideMask) { + if (M < 0) + continue; + M = (M % NumMaskElts) + Offsets[M / NumMaskElts]; + } + WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef); + + // Remove unused/repeated shuffle source ops. + resolveTargetShuffleInputsAndMask(WideInputs, WideMask); + assert(!WideInputs.empty() && "Shuffle with no inputs detected"); + + if (WideInputs.size() > 2) + return SDValue(); + + // Increase depth for every upper subvector we've peeked through. + Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; }); + + // Attempt to combine wider chain. + // TODO: Can we use a better Root? + SDValue WideRoot = WideInputs[0]; + if (SDValue WideShuffle = combineX86ShuffleChain( + WideInputs, WideRoot, WideMask, Depth, HasVariableMask, + AllowVariableMask, DAG, Subtarget)) { + WideShuffle = + extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); + return DAG.getBitcast(RootVT, WideShuffle); + } + return SDValue(); +} + // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. @@ -31381,19 +32672,10 @@ static SDValue combineX86ShufflesRecursively( if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); - // TODO - Add support for more than 2 inputs. - if (2 < OpInputs.size()) - return SDValue(); - - SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); - SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); - // Add the inputs to the Ops list, avoiding duplicates. SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { - if (!Input) - return -1; // Attempt to find an existing match. SDValue InputBC = peekThroughBitcasts(Input); for (int i = 0, e = Ops.size(); i < e; ++i) @@ -31409,8 +32691,9 @@ static SDValue combineX86ShufflesRecursively( return Ops.size() - 1; }; - int InputIdx0 = AddOp(Input0, SrcOpIndex); - int InputIdx1 = AddOp(Input1, -1); + SmallVector<int, 2> OpInputIdx; + for (SDValue OpInput : OpInputs) + OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? 
SrcOpIndex : -1)); assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || @@ -31482,13 +32765,9 @@ static SDValue combineX86ShufflesRecursively( : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); - if (OpMask[OpIdx] < (int)OpMask.size()) { - assert(0 <= InputIdx0 && "Unknown target shuffle input"); - OpMaskedIdx += InputIdx0 * MaskWidth; - } else { - assert(0 <= InputIdx1 && "Unknown target shuffle input"); - OpMaskedIdx += InputIdx1 * MaskWidth; - } + int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); + assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); + OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; Mask[i] = OpMaskedIdx; } @@ -31504,7 +32783,7 @@ static SDValue combineX86ShufflesRecursively( return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); - // Remove unused shuffle source ops. + // Remove unused/repeated shuffle source ops. resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); @@ -31541,29 +32820,42 @@ static SDValue combineX86ShufflesRecursively( return Cst; // We can only combine unary and binary shuffle mask cases. - if (Ops.size() > 2) - return SDValue(); + if (Ops.size() <= 2) { + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with sequential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. + SmallVector<int, 64> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + } + + // Canonicalization of binary shuffle masks to improve pattern matching by + // commuting the inputs. + if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(Ops[0], Ops[1]); + } - // Minor canonicalization of the accumulated shuffle mask to make it easier - // to match below. All this does is detect masks with sequential pairs of - // elements, and shrink them to the half-width mask. It does this in a loop - // so it will reduce the size of the mask to the minimal width mask which - // performs an equivalent shuffle. - SmallVector<int, 64> WidenedMask; - while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { - Mask = std::move(WidenedMask); + // Finally, try to combine into a single shuffle instruction. + return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, + AllowVariableMask, DAG, Subtarget); } - // Canonicalization of binary shuffle masks to improve pattern matching by - // commuting the inputs. - if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { - ShuffleVectorSDNode::commuteMask(Mask); - std::swap(Ops[0], Ops[1]); - } + // If that failed and any input is extracted then try to combine as a + // shuffle with the larger type. + return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth, + HasVariableMask, AllowVariableMask, + DAG, Subtarget); +} - // Finally, try to combine into a single shuffle instruction. - return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, - AllowVariableMask, DAG, Subtarget); +/// Helper entry wrapper to combineX86ShufflesRecursively. 
+static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, + /*AllowVarMask*/ true, DAG, Subtarget); } /// Get the PSHUF-style mask from PSHUF node. @@ -31781,12 +33073,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, switch (Opcode) { case X86ISD::VBROADCAST: { - // If broadcasting from another shuffle, attempt to simplify it. - // TODO - we really need a general SimplifyDemandedVectorElts mechanism. SDValue Src = N.getOperand(0); SDValue BC = peekThroughBitcasts(Src); EVT SrcVT = Src.getValueType(); EVT BCVT = BC.getValueType(); + + // If broadcasting from another shuffle, attempt to simplify it. + // TODO - we really need a general SimplifyDemandedVectorElts mechanism. if (isTargetShuffle(BC.getOpcode()) && VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) { unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits(); @@ -31800,6 +33093,71 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); } + + // broadcast(bitcast(src)) -> bitcast(broadcast(src)) + // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward. + if (Src.getOpcode() == ISD::BITCAST && + SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) { + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(), + VT.getVectorNumElements()); + return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); + } + + // Reduce broadcast source vector to lowest 128-bits. + if (SrcVT.getSizeInBits() > 128) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + extract128BitVector(Src, 0, DAG, DL)); + + // broadcast(scalar_to_vector(x)) -> broadcast(x). + if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + + // Share broadcast with the longest vector and extract low subvector (free). + for (SDNode *User : Src->uses()) + if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + return extractSubVector(SDValue(User, 0), 0, DAG, DL, + VT.getSizeInBits()); + } + + return SDValue(); + } + case X86ISD::BLENDI: { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + + // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. + // TODO: Handle MVT::v16i16 repeated blend mask. + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { + MVT SrcVT = N0.getOperand(0).getSimpleValueType(); + if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && + SrcVT.getScalarSizeInBits() >= 32) { + unsigned Mask = N.getConstantOperandVal(2); + unsigned Size = VT.getVectorNumElements(); + unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); + unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), + N1.getOperand(0), + DAG.getConstant(ScaleMask, DL, MVT::i8))); + } + } + return SDValue(); + } + case X86ISD::VPERMI: { + // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements. + // TODO: Remove when we have preferred domains in combineX86ShuffleChain. 
+ SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + if (N0.getOpcode() == ISD::BITCAST && + N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) { + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1); + return DAG.getBitcast(VT, Res); + } return SDValue(); } case X86ISD::PSHUFD: @@ -32223,8 +33581,22 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N) { - if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) - return SDValue(); + unsigned Opcode = N->getOpcode(); + if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) + if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) + return SDValue(); + + // For a broadcast, peek through an extract element of index 0 to find the + // horizontal op: broadcast (ext_vec_elt HOp, 0) + EVT VT = N->getValueType(0); + if (Opcode == X86ISD::VBROADCAST) { + SDValue SrcOp = N->getOperand(0); + if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + SrcOp.getValueType() == MVT::f64 && + SrcOp.getOperand(0).getValueType() == VT && + isNullConstant(SrcOp.getOperand(1))) + N = SrcOp.getNode(); + } SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && @@ -32235,13 +33607,25 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. - // TODO: Handle UNDEF operands. - if (HOp.getOperand(0) != HOp.getOperand(1)) + if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() && + HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); // When the operands of a horizontal math op are identical, the low half of - // the result is the same as the high half. If the shuffle is also replicating - // low and high halves, we don't need the shuffle. + // the result is the same as the high half. If a target shuffle is also + // replicating low and high halves, we don't need the shuffle. + if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { + if (HOp.getScalarValueSizeInBits() == 64) { + // movddup (hadd X, X) --> hadd X, X + // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X + assert((HOp.getValueType() == MVT::v2f64 || + HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && + "Unexpected type for h-op"); + return HOp; + } + return SDValue(); + } + // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, @@ -32263,14 +33647,51 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { return SDValue(); } +/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the +/// low half of each source vector and does not set any high half elements in +/// the destination vector, narrow the shuffle to half its original size. +static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { + if (!Shuf->getValueType(0).isSimple()) + return SDValue(); + MVT VT = Shuf->getSimpleValueType(0); + if (!VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + // See if we can ignore all of the high elements of the shuffle. 
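// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example of the narrowing fold above, assuming a v8f32 shuffle:
//   shuffle <8 x f32> %A, %B, <0, 8, 1, 9, u, u, u, u>
// only reads the low 128-bit halves of %A and %B and leaves the upper half
// of the result undef, so it can be rewritten as a v4f32 shuffle of the low
// subvectors (mask <0, 4, 1, 5>) and re-inserted into an undef ymm, since
// the xmm<->ymm subregister insert/extract operations are free.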
+ ArrayRef<int> Mask = Shuf->getMask(); + if (!isUndefUpperHalf(Mask)) + return SDValue(); + + // Check if the shuffle mask accesses only the low half of each input vector + // (half-index output is 0 or 2). + int HalfIdx1, HalfIdx2; + SmallVector<int, 8> HalfMask(Mask.size() / 2); + if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) || + (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1)) + return SDValue(); + + // Create a half-width shuffle to replace the unnecessarily wide shuffle. + // The trick is knowing that all of the insert/extract are actually free + // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle + // of narrow inputs into a narrow output, and that is always cheaper than + // the wide shuffle that we started with. + return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), + Shuf->getOperand(1), HalfMask, HalfIdx1, + HalfIdx2, false, DAG); +} + static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) + if (SDValue V = narrowShuffle(Shuf, DAG)) + return V; + + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we have legalized the vector types, look for blends of FADD and FSUB - // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; @@ -32339,23 +33760,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } - // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, - // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are - // consecutive, non-overlapping, and in the right order. - SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { - Elts.push_back(Elt); - continue; - } - Elts.clear(); - break; - } - - if (Elts.size() == VT.getVectorNumElements()) - if (SDValue LD = - EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) - return LD; + // Attempt to combine into a vector load/broadcast. + if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) + return LD; // For AVX2, we sometimes want to combine // (vector_shuffle <mask> (concat_vectors t1, undef) @@ -32376,9 +33783,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; // Simplify source operands based on shuffle mask. @@ -32389,6 +33794,68 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(N, 0); } + // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros + // in the upper 64 bits. + // TODO: Can we generalize this using computeKnownBits. 
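// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example: (v2i64 vzext_movl (bitcast (v4i32 conversion of a v2f64 source))).
// The narrow conversions listed below already write zeros into the upper
// 64 bits of their 128-bit result, so the vzext_movl is redundant and the
// existing bitcast can be returned directly.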
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && + (VT == MVT::v2f64 || VT == MVT::v2i64) && + N->getOperand(0).getOpcode() == ISD::BITCAST && + (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 || + N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) { + SDValue In = N->getOperand(0).getOperand(0); + switch (In.getOpcode()) { + default: + break; + case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: + case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: + case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: + case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: + case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: + case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: + case X86ISD::VFPROUND: case X86ISD::VMFPROUND: + if (In.getOperand(0).getValueType() == MVT::v2f64 || + In.getOperand(0).getValueType() == MVT::v2i64) + return N->getOperand(0); // return the bitcast + break; + } + } + + // Pull subvector inserts into undef through VZEXT_MOVL by making it an + // insert into a zero vector. This helps get VZEXT_MOVL closer to + // scalar_to_vectors where 256/512 are canonicalized to an insert and a + // 128-bit scalar_to_vector. This reduces the number of isel patterns. + if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && + N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR && + N->getOperand(0).hasOneUse() && + N->getOperand(0).getOperand(0).isUndef() && + isNullConstant(N->getOperand(0).getOperand(2))) { + SDValue In = N->getOperand(0).getOperand(1); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, + getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), + Movl, N->getOperand(0).getOperand(2)); + } + + // If this a vzmovl of a full vector load, replace it with a vzload, unless + // the load is volatile. + if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && + ISD::isNormalLoad(N->getOperand(0).getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + if (!LN->isVolatile()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + VT.getVectorElementType(), + LN->getPointerInfo(), + LN->getAlignment(), + MachineMemOperand::MOLoad); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return VZLoad; + } + } + + // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. // FIXME: This can probably go away once we default to widening legalization. @@ -32447,6 +33914,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Handle special case opcodes. switch (Opc) { + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: { + APInt LHSUndef, LHSZero; + APInt RHSUndef, RHSZero; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + Depth + 1)) + return true; + // Multiply by zero. 
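// [Editorial note -- illustrative sketch, not part of the upstream commit]
// A result element of PMULDQ/PMULUDQ is known zero whenever the matching
// element of either input is known zero (0 * x == 0), hence the union of
// the two zero masks below.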
+ KnownZero = LHSZero | RHSZero; + break; + } case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: { @@ -32454,11 +33937,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Amt = Op.getOperand(1); MVT AmtVT = Amt.getSimpleValueType(); assert(AmtVT.is128BitVector() && "Unexpected value type"); + + // If we reuse the shift amount just for sse shift amounts then we know that + // only the bottom 64-bits are only ever used. + bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { + unsigned UseOpc = Use->getOpcode(); + return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || + UseOpc == X86ISD::VSRA) && + Use->getOperand(0) != Amt; + }); + APInt AmtUndef, AmtZero; unsigned NumAmtElts = AmtVT.getVectorNumElements(); APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, - Depth + 1)) + Depth + 1, AssumeSingleUse)) return true; LLVM_FALLTHROUGH; } @@ -32498,6 +33991,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: { + APInt DemandedLHS, DemandedRHS; + getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + + APInt LHSUndef, LHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef, + LHSZero, TLO, Depth + 1)) + return true; + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef, + RHSZero, TLO, Depth + 1)) + return true; + break; + } + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + APInt SrcUndef, SrcZero; + if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, + Depth + 1)) + return true; + KnownZero = SrcZero.zextOrTrunc(NumElts); + KnownUndef = SrcUndef.zextOrTrunc(NumElts); + break; + } + case X86ISD::BLENDV: { + APInt SelUndef, SelZero; + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, + SelZero, TLO, Depth + 1)) + return true; + + // TODO: Use SelZero to adjust LHS/RHS DemandedElts. + APInt LHSUndef, LHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, + LHSZero, TLO, Depth + 1)) + return true; + + APInt RHSUndef, RHSZero; + if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, + RHSZero, TLO, Depth + 1)) + return true; + + KnownZero = LHSZero & RHSZero; + KnownUndef = LHSUndef & RHSUndef; + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -32505,7 +34050,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return false; // Don't bother broadcasting if we just need the 0'th element. if (DemandedElts == 1) { - if(Src.getValueType() != VT) + if (Src.getValueType() != VT) Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, SDLoc(Op)); return TLO.CombineTo(Op, Src); @@ -32517,8 +34062,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } - case X86ISD::PSHUFB: { - // TODO - simplify other variable shuffle masks. + case X86ISD::SUBV_BROADCAST: { + // Reduce size of broadcast if we don't need the upper half. 
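// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example: a v16i32 SUBV_BROADCAST of a v4i32 source where only the low
// eight elements are demanded can be rebuilt as a v8i32 SUBV_BROADCAST of
// the same source, inserted into an undef v16i32 at element 0.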
+ unsigned HalfElts = NumElts / 2; + if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + SDValue Half = Src; + if (SrcVT.getVectorNumElements() != HalfElts) { + MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts); + Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src); + } + + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0, + TLO.DAG, SDLoc(Op), + Half.getValueSizeInBits())); + } + break; + } + case X86ISD::VPERMV: { + SDValue Mask = Op.getOperand(0); + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + break; + } + case X86ISD::PSHUFB: + case X86ISD::VPERMV3: + case X86ISD::VPERMILPV: { SDValue Mask = Op.getOperand(1); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, @@ -32526,6 +34099,106 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } + case X86ISD::VPPERM: + case X86ISD::VPERMIL2: { + SDValue Mask = Op.getOperand(2); + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + break; + } + } + + // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not + // demand any of the high elements, then narrow the op to 128/256-bits: e.g. + // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 + if ((VT.is256BitVector() || VT.is512BitVector()) && + DemandedElts.lshr(NumElts / 2) == 0) { + unsigned SizeInBits = VT.getSizeInBits(); + unsigned ExtSizeInBits = SizeInBits / 2; + + // See if 512-bit ops only use the bottom 128-bits. + if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) + ExtSizeInBits = SizeInBits / 4; + + switch (Opc) { + // Zero upper elements. + case X86ISD::VZEXT_MOVL: { + SDLoc DL(Op); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = + TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + // Byte shifts by immediate. + case X86ISD::VSHLDQ: + case X86ISD::VSRLDQ: + // Shift by uniform. + case X86ISD::VSHL: + case X86ISD::VSRL: + case X86ISD::VSRA: + // Shift by immediate. + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: { + SDLoc DL(Op); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = + TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1)); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + case X86ISD::VPERMI: { + // Simplify PERMPD/PERMQ to extract_subvector. + // TODO: This should be done in shuffle combining. + if (VT == MVT::v4f64 || VT == MVT::v4i64) { + SmallVector<int, 4> Mask; + DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask); + if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) { + SDLoc DL(Op); + SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128); + return TLO.CombineTo(Op, Insert); + } + } + break; + } + // Target Shuffles. 
+ case X86ISD::PSHUFB: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // Saturated Packs. + case X86ISD::PACKSS: + case X86ISD::PACKUS: + // Horizontal Ops. + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: { + SDLoc DL(Op); + MVT ExtVT = VT.getSimpleVT(); + ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), + ExtSizeInBits / ExtVT.getScalarSizeInBits()); + SDValue Ext0 = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue Ext1 = + extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits); + SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1); + SDValue UndefVec = TLO.DAG.getUNDEF(VT); + SDValue Insert = + insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); + return TLO.CombineTo(Op, Insert); + } + } } // Simplify target shuffles. @@ -32617,9 +34290,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue RHS = Op.getOperand(1); // FIXME: Can we bound this better? APInt DemandedMask = APInt::getLowBitsSet(64, 32); - if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1)) + if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp, + TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1)) + if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, + TLO, Depth + 1)) return true; break; } @@ -32738,6 +34413,97 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } break; } + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: { + SDValue Vec = Op.getOperand(0); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + MVT VecVT = Vec.getSimpleValueType(); + unsigned NumVecElts = VecVT.getVectorNumElements(); + + if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) { + unsigned Idx = CIdx->getZExtValue(); + unsigned VecBitWidth = VecVT.getScalarSizeInBits(); + + // If we demand no bits from the vector then we must have demanded + // bits from the implict zext - simplify to zero. 
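// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example: (i32 PEXTRB %v, 5) implicitly zero-extends the extracted byte,
// so if only bits 8..31 of the result are demanded, every demanded bit
// comes from the implicit zero extension and the node folds to constant 0.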
+ APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth); + if (DemandedVecBits == 0) + return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); + + APInt KnownUndef, KnownZero; + APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx); + if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, + KnownZero, TLO, Depth + 1)) + return true; + + KnownBits KnownVec; + if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, + KnownVec, TLO, Depth + 1)) + return true; + + Known = KnownVec.zext(BitWidth, true); + return false; + } + break; + } + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + SDValue Vec = Op.getOperand(0); + SDValue Scl = Op.getOperand(1); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + MVT VecVT = Vec.getSimpleValueType(); + + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { + unsigned Idx = CIdx->getZExtValue(); + if (!OriginalDemandedElts[Idx]) + return TLO.CombineTo(Op, Vec); + + KnownBits KnownVec; + APInt DemandedVecElts(OriginalDemandedElts); + DemandedVecElts.clearBit(Idx); + if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, + KnownVec, TLO, Depth + 1)) + return true; + + KnownBits KnownScl; + unsigned NumSclBits = Scl.getScalarValueSizeInBits(); + APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits); + if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) + return true; + + KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); + Known.One = KnownVec.One & KnownScl.One; + Known.Zero = KnownVec.Zero & KnownScl.Zero; + return false; + } + break; + } + case X86ISD::PACKSS: + // PACKSS saturates to MIN/MAX integer values. So if we just want the + // sign bit then we can just ask for the source operands sign bit. + // TODO - add known bits handling. + if (OriginalDemandedBits.isSignMask()) { + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS); + + KnownBits KnownLHS, KnownRHS; + APInt SignMask = APInt::getSignMask(BitWidth * 2); + if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS, + KnownLHS, TLO, Depth + 1)) + return true; + if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, + KnownRHS, TLO, Depth + 1)) + return true; + } + // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. + break; + case X86ISD::PCMPGT: + // icmp sgt(0, R) == ashr(R, BitWidth-1). + // iff we only need the sign bit then we can use R directly. + if (OriginalDemandedBits.isSignMask() && + ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) + return TLO.CombineTo(Op, Op.getOperand(1)); + break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -32879,29 +34645,42 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +// Helper to peek through bitops/setcc to determine size of source vector. +// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>. 
+static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { + switch (Src.getOpcode()) { + case ISD::SETCC: + return Src.getOperand(0).getValueSizeInBits() == Size; + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return checkBitcastSrcVectorSize(Src.getOperand(0), Size) && + checkBitcastSrcVectorSize(Src.getOperand(1), Size); + } + return false; +} + // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the illegal vector is scalarized on subtargets that don't have legal // vxi1 types. -static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, +static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, + const SDLoc &DL, const X86Subtarget &Subtarget) { - EVT VT = BitCast.getValueType(); - SDValue N0 = BitCast.getOperand(0); - EVT VecVT = N0->getValueType(0); - - if (!VT.isScalarInteger() || !VecVT.isSimple()) + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. - bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - (N0.getOperand(0).getValueType() == MVT::v16i8 || - N0.getOperand(0).getValueType() == MVT::v32i8 || - N0.getOperand(0).getValueType() == MVT::v64i8); + bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && + (Src.getOperand(0).getValueType() == MVT::v16i8 || + Src.getOperand(0).getValueType() == MVT::v32i8 || + Src.getOperand(0).getValueType() == MVT::v64i8); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. @@ -32919,7 +34698,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; - switch (VecVT.getSimpleVT().SimpleTy) { + switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: @@ -32929,10 +34708,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - N0->getOperand(0).getValueType().is256BitVector()) { + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) SExtVT = MVT::v4i64; - } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -32941,9 +34718,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. 
- if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (N0->getOperand(0).getValueType().is256BitVector() || - N0->getOperand(0).getValueType().is512BitVector())) { + // TODO : use checkBitcastSrcVectorSize + if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && + (Src.getOperand(0).getValueType().is256BitVector() || + Src.getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; } break; @@ -32967,8 +34745,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, return SDValue(); }; - SDLoc DL(BitCast); - SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0); + SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); if (SExtVT == MVT::v64i8) { SDValue Lo, Hi; @@ -32988,7 +34765,11 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, DAG.getUNDEF(MVT::v8i16)); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); } - return DAG.getZExtOrTrunc(V, DL, VT); + + EVT IntVT = + EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); + V = DAG.getZExtOrTrunc(V, DL, IntVT); + return DAG.getBitcast(VT, V); } // Convert a vXi1 constant build vector to the same width scalar integer. @@ -33065,12 +34846,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG, +static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDLoc DL(N); - unsigned NumElts = N.getNumOperands(); - - auto *BV = cast<BuildVectorSDNode>(N); + SDLoc DL(BV); + unsigned NumElts = BV->getNumOperands(); SDValue Splat = BV->getSplatValue(); // Build MMX element from integer GPR or SSE float values. @@ -33118,7 +34897,7 @@ static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG, Ops.append(NumElts, Splat); } else { for (unsigned i = 0; i != NumElts; ++i) - Ops.push_back(CreateMMXElement(N.getOperand(i))); + Ops.push_back(CreateMMXElement(BV->getOperand(i))); } // Use tree of PUNPCKLs to build up general MMX vector. @@ -33152,14 +34931,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) { - if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) + SDLoc dl(N); + if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && Subtarget.hasAVX512()) { - SDLoc dl(N); N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, @@ -33170,7 +34949,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // type, widen both sides to avoid a trip through memory. 
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { - SDLoc dl(N); unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; @@ -33224,7 +35002,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8)) - return createMMXBuildVector(N0, DAG, Subtarget); + return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget); // Detect bitcasts between element or subvector extraction to x86mmx. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || @@ -33308,66 +35086,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Given a select, detect the following pattern: -// 1: %2 = zext <N x i8> %0 to <N x i32> -// 2: %3 = zext <N x i8> %1 to <N x i32> -// 3: %4 = sub nsw <N x i32> %2, %3 -// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] -// 5: %6 = sub nsw <N x i32> zeroinitializer, %4 -// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 +// Given a ABS node, detect the following pattern: +// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). // This is useful as it is the input into a SAD pattern. -static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, - SDValue &Op1) { - // Check the condition of the select instruction is greater-than. - SDValue SetCC = Select->getOperand(0); - if (SetCC.getOpcode() != ISD::SETCC) - return false; - ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); - if (CC != ISD::SETGT && CC != ISD::SETLT) - return false; - - SDValue SelectOp1 = Select->getOperand(1); - SDValue SelectOp2 = Select->getOperand(2); - - // The following instructions assume SelectOp1 is the subtraction operand - // and SelectOp2 is the negation operand. - // In the case of SETLT this is the other way around. - if (CC == ISD::SETLT) - std::swap(SelectOp1, SelectOp2); - - // The second operand of the select should be the negation of the first - // operand, which is implemented as 0 - SelectOp1. - if (!(SelectOp2.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) && - SelectOp2.getOperand(1) == SelectOp1)) - return false; - - // The first operand of SetCC is the first operand of the select, which is the - // difference between the two input vectors. - if (SetCC.getOperand(0) != SelectOp1) - return false; - - // In SetLT case, The second operand of the comparison can be either 1 or 0. - APInt SplatVal; - if ((CC == ISD::SETLT) && - !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && - SplatVal.isOneValue()) || - (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) +static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { + SDValue AbsOp1 = Abs->getOperand(0); + if (AbsOp1.getOpcode() != ISD::SUB) return false; - // In SetGT case, The second operand of the comparison can be either -1 or 0. - if ((CC == ISD::SETGT) && - !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || - ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) - return false; - - // The first operand of the select is the difference between the two input - // vectors. 
- if (SelectOp1.getOpcode() != ISD::SUB) - return false; - - Op0 = SelectOp1.getOperand(0); - Op1 = SelectOp1.getOperand(1); + Op0 = AbsOp1.getOperand(0); + Op1 = AbsOp1.getOperand(1); // Check if the operands of the sub are zero-extended from vectors of i8. if (Op0.getOpcode() != ISD::ZERO_EXTEND || @@ -33487,23 +35215,25 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } -// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. +// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Bail without SSE2 or with AVX512VL (which uses predicate registers). - if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) + // Bail without SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && - ExtractVT != MVT::i8) + ExtractVT != MVT::i8 && ExtractVT != MVT::i1) return SDValue(); - // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. + // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns. ISD::NodeType BinOp; SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); + if (!Match && ExtractVT == MVT::i1) + Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR}); if (!Match) return SDValue(); @@ -33512,53 +35242,104 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); - // We require AVX2 for PMOVMSKB for v16i16/v32i8; - unsigned MatchSizeInBits = Match.getValueSizeInBits(); - if (!(MatchSizeInBits == 128 || - (MatchSizeInBits == 256 && - ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) - return SDValue(); + SDValue Movmsk; + SDLoc DL(Extract); + EVT MatchVT = Match.getValueType(); + unsigned NumElts = MatchVT.getVectorNumElements(); - // Don't bother performing this for 2-element vectors. - if (Match.getValueType().getVectorNumElements() <= 2) - return SDValue(); + if (ExtractVT == MVT::i1) { + // Special case for (pre-legalization) vXi1 reductions. + if (NumElts > 32) + return SDValue(); + if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) { + // If this is a legal AVX512 predicate type then we can just bitcast. + EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + Movmsk = DAG.getBitcast(MovmskVT, Match); + } else { + // Use combineBitcastvxi1 to create the MOVMSK. + if (NumElts == 32 && !Subtarget.hasInt256()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); + Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); + NumElts = 16; + } + EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); + } + if (!Movmsk) + return SDValue(); + Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); + } else { + // Bail with AVX512VL (which uses predicate registers). + if (Subtarget.hasVLX()) + return SDValue(); - // Check that we are extracting a reduction of all sign bits. 
- if (DAG.ComputeNumSignBits(Match) != BitWidth) - return SDValue(); + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && Subtarget.hasAVX()))) + return SDValue(); - // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. - MVT MaskVT; - if (64 == BitWidth || 32 == BitWidth) - MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), - MatchSizeInBits / BitWidth); - else - MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + // Make sure this isn't a vector of 1 element. The perf win from using + // MOVMSK diminishes with less elements in the reduction, but it is + // generally better to get the comparison over to the GPRs as soon as + // possible to reduce the number of vector ops. + if (Match.getValueType().getVectorNumElements() < 2) + return SDValue(); + + // Check that we are extracting a reduction of all sign bits. + if (DAG.ComputeNumSignBits(Match) != BitWidth) + return SDValue(); + + if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); + Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); + MatchSizeInBits = Match.getValueSizeInBits(); + } + + // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. + MVT MaskSrcVT; + if (64 == BitWidth || 32 == BitWidth) + MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), + MatchSizeInBits / BitWidth); + else + MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + + SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); + Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); + NumElts = MaskSrcVT.getVectorNumElements(); + } + assert(NumElts <= 32 && "Not expecting more than 32 elements"); - APInt CompareBits; + if (BinOp == ISD::XOR) { + // parity -> (AND (CTPOP(MOVMSK X)), 1) + SDValue Mask = DAG.getConstant(1, DL, MVT::i32); + SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk); + Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask); + return DAG.getZExtOrTrunc(Result, DL, ExtractVT); + } + + SDValue CmpC; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 - CompareBits = APInt::getNullValue(32); + CmpC = DAG.getConstant(0, DL, MVT::i32); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) - CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); + CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32); CondCode = ISD::CondCode::SETEQ; } - // Perform the select as i32/i64 and then truncate to avoid partial register - // stalls. - unsigned ResWidth = std::max(BitWidth, 32u); - EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); - SDLoc DL(Extract); - SDValue Zero = DAG.getConstant(0, DL, ResVT); - SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); - SDValue Res = DAG.getBitcast(MaskVT, Match); - Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); - Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), - Ones, Zero, CondCode); - return DAG.getSExtOrTrunc(Res, DL, ExtractVT); + // The setcc produces an i8 of 0/1, so extend that to the result width and + // negate to get the final 0/-1 mask value. 
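// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example: an all_of reduction over a v4i32 compare result becomes roughly
//   setcc (movmsk %x), 0xF, eq
// an any_of reduction compares the movmsk result against 0 with setne, and
// the parity case above uses (ctpop(movmsk) & 1). The boolean setcc result
// is then zero-extended and subtracted from 0 to recover the 0/-1 value
// that the original sign-bit reduction produced.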
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetccVT = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); + SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); + SDValue Zero = DAG.getConstant(0, DL, ExtractVT); + return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, @@ -33603,7 +35384,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. - if (!Root || (Root.getOpcode() != ISD::VSELECT)) + if (!Root || Root.getOpcode() != ISD::ABS) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. @@ -33662,15 +35443,19 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) return SDValue(); + SDValue SrcBC = peekThroughBitcasts(Src); + // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. - if (X86ISD::VBROADCAST == Src.getOpcode() && - Src.getOperand(0).getValueType() == VT) - return Src.getOperand(0); + if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { + SDValue SrcOp = SrcBC.getOperand(0); + if (SrcOp.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, SrcOp); + } // Resolve the target shuffle inputs and mask. SmallVector<int, 16> Mask; SmallVector<SDValue, 2> Ops; - if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) + if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. @@ -33715,7 +35500,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[SrcIdx / Mask.size()]; - SrcOp = DAG.getBitcast(SrcVT, SrcOp); SrcIdx = SrcIdx % Mask.size(); // We can only extract other elements from 128-bit vectors and in certain @@ -33725,6 +35509,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { assert(SrcSVT == VT && "Unexpected extraction type"); + SrcOp = DAG.getBitcast(SrcVT, SrcOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); } @@ -33734,6 +35519,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); + SrcOp = DAG.getBitcast(SrcVT, SrcOp); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); @@ -33742,6 +35528,155 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Extracting a scalar FP value from vector element 0 is free, so extract each +/// operand first, then perform the math as a scalar op. 
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { + assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); + SDValue Vec = ExtElt->getOperand(0); + SDValue Index = ExtElt->getOperand(1); + EVT VT = ExtElt->getValueType(0); + EVT VecVT = Vec.getValueType(); + + // TODO: If this is a unary/expensive/expand op, allow extraction from a + // non-zero element because the shuffle+scalar op will be cheaper? + if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT) + return SDValue(); + + // Vector FP compares don't fit the pattern of FP math ops (propagate, not + // extract, the condition code), so deal with those as a special-case. + if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) { + EVT OpVT = Vec.getOperand(0).getValueType().getScalarType(); + if (OpVT != MVT::f32 && OpVT != MVT::f64) + return SDValue(); + + // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, + Vec.getOperand(1), Index); + return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); + } + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Vector FP selects don't fit the pattern of FP math ops (because the + // condition has a different type and we have to change the opcode), so deal + // with those here. + // FIXME: This is restricted to pre type legalization by ensuring the setcc + // has i1 elements. If we loosen this we need to convert vector bool to a + // scalar bool. + if (Vec.getOpcode() == ISD::VSELECT && + Vec.getOperand(0).getOpcode() == ISD::SETCC && + Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 && + Vec.getOperand(0).getOperand(0).getValueType() == VecVT) { + // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0) + SDLoc DL(ExtElt); + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + Vec.getOperand(0).getValueType().getScalarType(), + Vec.getOperand(0), Index); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + Vec.getOperand(1), Index); + SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + Vec.getOperand(2), Index); + return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2); + } + + // TODO: This switch could include FNEG and the x86-specific FP logic ops + // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid + // missed load folding and fma+fneg combining. + switch (Vec.getOpcode()) { + case ISD::FMA: // Begin 3 operands + case ISD::FMAD: + case ISD::FADD: // Begin 2 operands + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FCOPYSIGN: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: + case X86ISD::FMAX: + case X86ISD::FMIN: + case ISD::FABS: // Begin 1 operand + case ISD::FSQRT: + case ISD::FRINT: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case X86ISD::FRCP: + case X86ISD::FRSQRT: { + // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ... 
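// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example: (f32 extract_vector_elt (fadd v4f32 %a, %b), 0) becomes
// (fadd (extract %a, 0), (extract %b, 0)); extracting lane 0 of an XMM
// register is free, so the scalar op avoids a full-width vector op when
// only element 0 is used.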
+ SDLoc DL(ExtElt); + SmallVector<SDValue, 4> ExtOps; + for (SDValue Op : Vec->ops()) + ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index)); + return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps); + } + default: + return SDValue(); + } + llvm_unreachable("All opcodes should return within switch"); +} + +/// Try to convert a vector reduction sequence composed of binops and shuffles +/// into horizontal ops. +static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!Subtarget.hasFastHorizontalOps() && !OptForSize) + return SDValue(); + SDValue Index = ExtElt->getOperand(1); + if (!isNullConstant(Index)) + return SDValue(); + + // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros. + ISD::NodeType Opc; + SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}); + if (!Rdx) + return SDValue(); + + EVT VT = ExtElt->getValueType(0); + EVT VecVT = ExtElt->getOperand(0).getValueType(); + if (VecVT.getScalarType() != VT) + return SDValue(); + + unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; + SDLoc DL(ExtElt); + + // 256-bit horizontal instructions operate on 128-bit chunks rather than + // across the whole vector, so we need an extract + hop preliminary stage. + // This is the only step where the operands of the hop are not the same value. + // TODO: We could extend this to handle 512-bit or even longer vectors. + if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) || + ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) { + unsigned NumElts = VecVT.getVectorNumElements(); + SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); + SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); + VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2); + Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo); + } + if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && + !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) + return SDValue(); + + // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 + assert(Rdx.getValueType() == VecVT && "Unexpected reduction match"); + unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); + for (unsigned i = 0; i != ReductionSteps; ++i) + Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); +} + /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading @@ -33752,23 +35687,48 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; + SDValue InputVector = N->getOperand(0); + SDValue EltIdx = N->getOperand(1); + auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx); + + EVT SrcVT = InputVector.getValueType(); + EVT VT = N->getValueType(0); + SDLoc dl(InputVector); + bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT; + + if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements())) + return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); + + // Integer Constant Folding. 
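// [Editorial note -- illustrative sketch, not part of the upstream commit]
// Example: extracting element 1 from a constant vector <i32 1, i32 2,
// i32 3, i32 4> folds straight to the constant 2; an undef source element
// folds to undef for extract_vector_elt, or to 0 for PEXTRB/PEXTRW since
// those implicitly zero-extend their result.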
+ if (CIdx && VT.isInteger()) { + APInt UndefVecElts; + SmallVector<APInt, 16> EltBits; + unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits(); + if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts, + EltBits, true, false)) { + uint64_t Idx = CIdx->getZExtValue(); + if (UndefVecElts[Idx]) + return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); + return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()), + dl, VT); + } + } + // TODO - Remove this once we can handle the implicit zero-extension of // X86ISD::PEXTRW/X86ISD::PEXTRB in: // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and // combineBasicSADPattern. - if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + if (IsPextr) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits( + SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI)) + return SDValue(N, 0); return SDValue(); + } if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; - SDValue InputVector = N->getOperand(0); - SDValue EltIdx = N->getOperand(1); - - EVT SrcVT = InputVector.getValueType(); - EVT VT = N->getValueType(0); - SDLoc dl(InputVector); - // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { @@ -33789,16 +35749,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } - if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && - isa<ConstantSDNode>(EltIdx) && - isa<ConstantSDNode>(InputVector.getOperand(0))) { - uint64_t ExtractedElt = N->getConstantOperandVal(1); - auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0)); - const APInt &InputValue = InputC->getAPIntValue(); - uint64_t Res = InputValue[ExtractedElt]; - return DAG.getConstant(Res, dl, MVT::i1); - } - // Check whether this extract is the root of a sum of absolute differences // pattern. This has to be done here because we really want it to happen // pre-legalization, @@ -33813,6 +35763,45 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; + if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget)) + return V; + + if (SDValue V = scalarizeExtEltFP(N, DAG)) + return V; + + // Attempt to extract a i1 element by using MOVMSK to extract the signbits + // and then testing the relevant element. 
+ if (CIdx && SrcVT.getScalarType() == MVT::i1) { + SmallVector<SDNode *, 16> BoolExtracts; + auto IsBoolExtract = [&BoolExtracts](SDNode *Use) { + if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(Use->getOperand(1)) && + Use->getValueType(0) == MVT::i1) { + BoolExtracts.push_back(Use); + return true; + } + return false; + }; + if (all_of(InputVector->uses(), IsBoolExtract) && + BoolExtracts.size() > 1) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); + if (SDValue BC = + combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { + for (SDNode *Use : BoolExtracts) { + // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask + unsigned MaskIdx = Use->getConstantOperandVal(1); + APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx); + SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT); + SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); + Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); + DCI.CombineTo(Use, Res); + } + return SDValue(N, 0); + } + } + } + return SDValue(); } @@ -33836,11 +35825,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, assert(CondVT.isVector() && "Vector select expects a vector selector!"); - bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. - if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && - CondVT.getVectorElementType() == MVT::i1) { + // TODO: Use isNullOrNullSplat() to distinguish constants with undefs? + // TODO: Can we assert that both operands are not zeros (because that should + // get simplified at node creation time)? + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && + Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 @@ -33855,12 +35848,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); - bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); - bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); - // Try to invert the condition if true value is not all 1s and false value is - // not all 0s. - if (!TValIsAllOnes && !FValIsAllZeros && + // not all 0s. Only do this if the condition has one use. + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() && // Check if the selector will be produced by CMPP*/PCMP*. Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted. 
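
The MOVMSK rewrite above ("extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask") has a direct intrinsics analogue; a standalone SSE2 sketch, not part of the patch (the helper name is ours), where a byte-compare result stands in for the vXi1 source:

#include <emmintrin.h>  // SSE2
#include <cassert>
#include <cstdint>

static bool extract_bool_lane(__m128i boolvec, unsigned lane) {
  // boolvec holds 0x00/0xFF per byte, e.g. a PCMPEQB result.
  uint32_t msk = static_cast<uint32_t>(_mm_movemask_epi8(boolvec)); // one sign bit per byte
  uint32_t bit = 1u << lane;
  return (msk & bit) == bit;  // same as ((msk >> lane) & 1) != 0
}

int main() {
  __m128i a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
  __m128i b = _mm_setr_epi8(1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0);
  __m128i eq = _mm_cmpeq_epi8(a, b);  // lane i is all-ones iff a[i] == b[i]
  assert(extract_bool_lane(eq, 0));   // 1 == 1
  assert(!extract_bool_lane(eq, 1));  // 2 != 0
  return 0;
}
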
@@ -33918,6 +35909,39 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// If both arms of a vector select are concatenated vectors, split the select, +/// and concatenate the result to eliminate a wide (256-bit) vector instruction: +/// vselect Cond, (concat T0, T1), (concat F0, F1) --> +/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1) +static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT) + return SDValue(); + + // TODO: Split 512-bit vectors too? + EVT VT = N->getValueType(0); + if (!VT.is256BitVector()) + return SDValue(); + + // TODO: Split as long as any 2 of the 3 operands are concatenated? + SDValue Cond = N->getOperand(0); + SDValue TVal = N->getOperand(1); + SDValue FVal = N->getOperand(2); + SmallVector<SDValue, 4> CatOpsT, CatOpsF; + if (!TVal.hasOneUse() || !FVal.hasOneUse() || + !collectConcatOps(TVal.getNode(), CatOpsT) || + !collectConcatOps(FVal.getNode(), CatOpsF)) + return SDValue(); + + auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops); + }; + return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal }, + makeBlend, /*CheckBWI*/ false); +} + static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); @@ -33984,7 +36008,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { /// If this is a *dynamic* select (non-constant condition) and we can match /// this node with one of the variable blend instructions, restructure the /// condition so that blends can use the high (sign) bit of each element. -/// This function will also call SimplfiyDemandedBits on already created +/// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -34279,6 +36303,42 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } + // AVX512 - Extend select with zero to merge with target shuffle. + // select(mask, extract_subvector(shuffle(x)), zero) --> + // extract_subvector(select(insert_subvector(mask), shuffle(x), zero)) + // TODO - support non target shuffles as well. + if (Subtarget.hasAVX512() && CondVT.isVector() && + CondVT.getVectorElementType() == MVT::i1) { + auto SelectableOp = [&TLI](SDValue Op) { + return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + isTargetShuffle(Op.getOperand(0).getOpcode()) && + isNullConstant(Op.getOperand(1)) && + TLI.isTypeLegal(Op.getOperand(0).getValueType()) && + Op.hasOneUse() && Op.getOperand(0).hasOneUse(); + }; + + bool SelectableLHS = SelectableOp(LHS); + bool SelectableRHS = SelectableOp(RHS); + bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) { + EVT SrcVT = SelectableLHS ? 
LHS.getOperand(0).getValueType() + : RHS.getOperand(0).getValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts); + LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL, + VT.getSizeInBits()); + RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL, + VT.getSizeInBits()); + Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, + DAG.getUNDEF(SrcCondVT), Cond, + DAG.getIntPtrConstant(0, DL)); + SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); + return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); + } + } + if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; @@ -34349,14 +36409,16 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C - // TODO: Handle build_vectors with undef elements. auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { - return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1); + return (!Op && !Cond) || + (Op && Cond && + Cond->getAPIntValue() == (-Op->getAPIntValue() - 1)); }; if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) { - OpRHS = DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), OpRHS); + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT, + /*AllowUndefs*/ true)) { + OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + OpRHS); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } @@ -34443,6 +36505,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) return V; + if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) + return V; + // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); @@ -34726,7 +36791,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. -static SDValue combineCarryThroughADD(SDValue EFLAGS) { +static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); @@ -34739,8 +36804,34 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) { Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { - if (Carry.getConstantOperandVal(0) == X86::COND_B) - return Carry.getOperand(1); + // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? + uint64_t CarryCC = Carry.getConstantOperandVal(0); + SDValue CarryOp1 = Carry.getOperand(1); + if (CarryCC == X86::COND_B) + return CarryOp1; + if (CarryCC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp + // instruction cannot take an immediate as its first operand. 
+ // + if (CarryOp1.getOpcode() == X86ISD::SUB && + CarryOp1.getNode()->hasOneUse() && + CarryOp1.getValueType().isInteger() && + !isa<ConstantSDNode>(CarryOp1.getOperand(1))) { + SDValue SubCommute = + DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(), + CarryOp1.getOperand(1), CarryOp1.getOperand(0)); + return SDValue(SubCommute.getNode(), CarryOp1.getResNo()); + } + } + // If this is a check of the z flag of an add with 1, switch to the + // C flag. + if (CarryCC == X86::COND_E && + CarryOp1.getOpcode() == X86ISD::ADD && + isOneConstant(CarryOp1.getOperand(1))) + return CarryOp1; } } } @@ -34755,7 +36846,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (CC == X86::COND_B) - if (SDValue Flags = combineCarryThroughADD(EFLAGS)) + if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) @@ -34774,6 +36865,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); + // cmov X, X, ?, ? --> X + if (TrueOp == FalseOp) + return TrueOp; + // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { @@ -35055,7 +37150,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. - bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize(); + bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); @@ -35294,8 +37389,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Use SplitOpsAndApply to handle AVX splitting. auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops); + MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) }, @@ -35363,7 +37458,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. 
- if (DAG.getMachineFunction().getFunction().optForMinSize()) + if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) @@ -35500,7 +37595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt Mask = N0.getConstantOperandAPInt(1); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if @@ -35649,24 +37744,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineShift(SDNode* N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (N->getOpcode() == ISD::SHL) - if (SDValue V = combineShiftLeft(N, DAG)) - return V; - - if (N->getOpcode() == ISD::SRA) - if (SDValue V = combineShiftRightArithmetic(N, DAG)) - return V; - - if (N->getOpcode() == ISD::SRL) - if (SDValue V = combineShiftRightLogical(N, DAG, DCI)) - return V; - - return SDValue(); -} - static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -35688,8 +37765,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector<APInt, 32> EltBits0, EltBits1; - if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) && - (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) && + if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) && + (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; @@ -35761,10 +37838,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, // Attempt to combine as shuffle. SDValue Op(N, 0); - if (SDValue Res = - combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, - /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); @@ -35777,11 +37851,22 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, X86ISD::VSRL == N->getOpcode()) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); // Shift zero -> zero. - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + if (ISD::isBuildVectorAllZeros(N0.getNode())) return DAG.getConstant(0, SDLoc(N), VT); + // Detect constant shift amounts. + APInt UndefElts; + SmallVector<APInt, 32> EltBits; + if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) { + unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false); + return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0, + EltBits[0].getZExtValue(), DAG); + } + APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); @@ -35840,9 +37925,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, // We can decode 'whole byte' logical bit shifts as shuffles. 
if (LogicalShift && (ShiftVal % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -35875,18 +37958,20 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - assert( - ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && - N->getValueType(0) == MVT::v8i16)) && - "Unexpected vector insertion"); + EVT VT = N->getValueType(0); + assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || + (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) && + "Unexpected vector insertion"); + + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), + APInt::getAllOnesValue(NumBitsPerElt), DCI)) + return SDValue(N, 0); // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); - if (SDValue Res = - combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, - /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); @@ -35905,8 +37990,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - SDValue CMP0 = N0->getOperand(1); - SDValue CMP1 = N1->getOperand(1); + SDValue CMP0 = N0.getOperand(1); + SDValue CMP1 = N1.getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. @@ -35998,6 +38083,34 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Match (xor X, -1) -> X. +// Match extract_subvector(xor X, -1) -> extract_subvector(X). +// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). +static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { + V = peekThroughBitcasts(V); + if (V.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) + return V.getOperand(0); + if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { + if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { + Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), + Not, V.getOperand(1)); + } + } + SmallVector<SDValue, 2> CatOps; + if (collectConcatOps(V.getNode(), CatOps)) { + for (SDValue &CatOp : CatOps) { + SDValue NotCat = IsNOT(CatOp, DAG); + if (!NotCat) return SDValue(); + CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); + } + return SDValue(); +} + /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). 
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); @@ -36007,15 +38120,14 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { return SDValue(); SDValue X, Y; - SDValue N0 = peekThroughBitcasts(N->getOperand(0)); - SDValue N1 = peekThroughBitcasts(N->getOperand(1)); - if (N0.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) { - X = N0.getOperand(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (SDValue Not = IsNOT(N0, DAG)) { + X = Not; Y = N1; - } else if (N1.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) { - X = N1.getOperand(0); + } else if (SDValue Not = IsNOT(N1, DAG)) { + X = Not; Y = N0; } else return SDValue(); @@ -36057,7 +38169,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // The type of the truncated inputs. - if (N0->getOperand(0).getValueType() != VT) + if (N0.getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. @@ -36073,9 +38185,9 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. - N0 = N0->getOperand(0); + N0 = N0.getOperand(0); if (RHSTrunc) - N1 = N1->getOperand(0); + N1 = N1.getOperand(0); else N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); @@ -36099,34 +38211,35 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - unsigned FPOpcode = ISD::DELETED_NODE; - if (N->getOpcode() == ISD::AND) - FPOpcode = X86ISD::FAND; - else if (N->getOpcode() == ISD::OR) - FPOpcode = X86ISD::FOR; - else if (N->getOpcode() == ISD::XOR) - FPOpcode = X86ISD::FXOR; - - assert(FPOpcode != ISD::DELETED_NODE && - "Unexpected input node for FP logic conversion"); - EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && - ((Subtarget.hasSSE1() && VT == MVT::i32) || - (Subtarget.hasSSE2() && VT == MVT::i64))) { - SDValue N00 = N0.getOperand(0); - SDValue N10 = N1.getOperand(0); - EVT N00Type = N00.getValueType(); - EVT N10Type = N10.getValueType(); - if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { - SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); - return DAG.getBitcast(VT, FPLogic); - } + + if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + + // Ensure that both types are the same and are legal scalar fp types. 
+ if (N00Type != N10Type || + !((Subtarget.hasSSE1() && N00Type == MVT::f32) || + (Subtarget.hasSSE2() && N00Type == MVT::f64))) + return SDValue(); + + unsigned FPOpcode; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected input node for FP logic conversion"); + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; } - return SDValue(); + + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); } /// If this is a zero/all-bits result that is bitwise-anded with a low bits @@ -36382,6 +38495,24 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineParity(N, DAG, Subtarget)) return V; + // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector<SDValue, 2> SrcOps; + if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (Mask) { + APInt AllBits = APInt::getAllOnesValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ); + } + } + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -36403,9 +38534,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -36451,6 +38580,52 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)) +static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); + + EVT VT = N->getValueType(0); + if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0) + return SDValue(); + + SDValue N0 = peekThroughBitcasts(N->getOperand(0)); + SDValue N1 = peekThroughBitcasts(N->getOperand(1)); + if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) + return SDValue(); + + // On XOP we'll lower to PCMOV so accept one use, otherwise only + // do this if either mask has multiple uses already. + if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() || + !N1.getOperand(1).hasOneUse())) + return SDValue(); + + // Attempt to extract constant byte masks. + APInt UndefElts0, UndefElts1; + SmallVector<APInt, 32> EltBits0, EltBits1; + if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0, + false, false)) + return SDValue(); + if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1, + false, false)) + return SDValue(); + + for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) { + // TODO - add UNDEF elts support. 
+ if (UndefElts0[i] || UndefElts1[i]) + return SDValue(); + if (EltBits0[i] != ~EltBits1[i]) + return SDValue(); + } + + SDLoc DL(N); + SDValue X = N->getOperand(0); + SDValue Y = + DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), + DAG.getBitcast(VT, N1.getOperand(0))); + return DAG.getNode(ISD::OR, DL, VT, X, Y); +} + // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { if (N->getOpcode() != ISD::OR) @@ -36483,6 +38658,68 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { return true; } +// Try to match: +// (or (and (M, (sub 0, X)), (pandn M, X))) +// which is a special case of vselect: +// (vselect M, (sub 0, X), X) +// Per: +// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate +// We know that, if fNegate is 0 or 1: +// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) +// +// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: +// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) +// ( M ? -X : X) == ((X ^ M ) + (M & 1)) +// This lets us transform our vselect to: +// (add (xor X, M), (and M, 1)) +// And further to: +// (sub (xor X, M), M) +static SDValue combineLogicBlendIntoConditionalNegate( + EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { + EVT MaskVT = Mask.getValueType(); + assert(MaskVT.isInteger() && + DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && + "Mask must be zero/all-bits"); + + if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) + return SDValue(); + if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) + return SDValue(); + + auto IsNegV = [](SDNode *N, SDValue V) { + return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && + ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); + }; + + SDValue V; + if (IsNegV(Y.getNode(), X)) + V = X; + else if (IsNegV(X.getNode(), Y)) + V = Y; + else + return SDValue(); + + SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); + SDValue SubOp2 = Mask; + + // If the negate was on the false side of the select, then + // the operands of the SUB need to be swapped. PR 27251. + // This is because the pattern being matched above is + // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) + // but if the pattern matched was + // (vselect M, X, (sub (0, X))), that is really negation of the pattern + // above, -(vselect M, (sub 0, X), X), and therefore the replacement + // pattern also needs to be a negation of the replacement pattern above. + // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the + // sub accomplishes the negation of the replacement pattern. + if (V == Y) + std::swap(SubOp1, SubOp2); + + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); +} + // Try to fold: // (or (and (m, y), (pandn m, x))) // into: @@ -36518,55 +38755,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); - // Try to match: - // (or (and (M, (sub 0, X)), (pandn M, X))) - // which is a special case of vselect: - // (vselect M, (sub 0, X), X) - // Per: - // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate - // We know that, if fNegate is 0 or 1: - // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) - // - // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: - // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) - // ( M ? 
-X : X) == ((X ^ M ) + (M & 1)) - // This lets us transform our vselect to: - // (add (xor X, M), (and M, 1)) - // And further to: - // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && - DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { - auto IsNegV = [](SDNode *N, SDValue V) { - return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && - ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); - }; - SDValue V; - if (IsNegV(Y.getNode(), X)) - V = X; - else if (IsNegV(X.getNode(), Y)) - V = Y; - - if (V) { - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); - SDValue SubOp2 = Mask; - - // If the negate was on the false side of the select, then - // the operands of the SUB need to be swapped. PR 27251. - // This is because the pattern being matched above is - // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) - // but if the pattern matched was - // (vselect M, X, (sub (0, X))), that is really negation of the pattern - // above, -(vselect M, (sub 0, X), X), and therefore the replacement - // pattern also needs to be a negation of the replacement pattern above. - // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the - // sub accomplishes the negation of the replacement pattern. - if (V == Y) - std::swap(SubOp1, SubOp2); - - SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); - return DAG.getBitcast(VT, Res); - } - } + // Attempt to combine to conditional negate: (sub (xor X, M), M) + if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL, + DAG, Subtarget)) + return Res; // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) @@ -36676,8 +38868,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); - EVT VT = OR->getValueType(0); - SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); @@ -36713,15 +38904,16 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; + if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) + return R; + if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; // Attempt to recursively combine an OR of shuffles. 
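
The conditional-negate helper factored out above relies on the bit-hack identity its comment cites: for a lane mask M that is all-zeros or all-ones, (M ? -X : X) == ((X ^ M) - M). A minimal standalone check in plain C++, not part of the patch:

#include <cassert>
#include <cstdint>

static int32_t cond_negate(int32_t x, uint32_t m) {  // m is 0 or 0xFFFFFFFF
  return static_cast<int32_t>((static_cast<uint32_t>(x) ^ m) - m);  // (sub (xor X, M), M)
}

int main() {
  for (int32_t x : {0, 1, -1, 7, -42, INT32_MIN + 1}) {
    assert(cond_negate(x, 0u) == x);            // mask clear: pass-through
    assert(cond_negate(x, 0xFFFFFFFFu) == -x);  // mask set: negation
  }
  return 0;
}
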
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -36729,7 +38921,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some @@ -36758,14 +38950,14 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SDValue ShMsk0; if (ShAmt0.getOpcode() == ISD::AND && isa<ConstantSDNode>(ShAmt0.getOperand(1)) && - ShAmt0.getConstantOperandVal(1) == (Bits - 1)) { + ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk0 = ShAmt0; ShAmt0 = ShAmt0.getOperand(0); } SDValue ShMsk1; if (ShAmt1.getOpcode() == ISD::AND && isa<ConstantSDNode>(ShAmt1.getOperand(1)) && - ShAmt1.getConstantOperandVal(1) == (Bits - 1)) { + ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk1 = ShAmt1; ShAmt1 = ShAmt1.getOperand(0); } @@ -36776,46 +38968,55 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); - unsigned Opc = X86ISD::SHLD; + unsigned Opc = ISD::FSHL; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); - if (ShAmt0.getOpcode() == ISD::SUB || - ShAmt0.getOpcode() == ISD::XOR) { - Opc = X86ISD::SHRD; + if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { + Opc = ISD::FSHR; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); std::swap(ShMsk0, ShMsk1); } - // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) - // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) - // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C ) - // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C ) + auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1, + SDValue Amt) { + if (Opc == ISD::FSHR) + std::swap(Op0, Op1); + return DAG.getNode(Opc, DL, VT, Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt)); + }; + + // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C ) + // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C ) + // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C ) + // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C ) if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getOpcode() == ISD::AND && + isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) && + ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) { + ShMsk1 = ShAmt1Op1; + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + } if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if ((SumC->getAPIntValue() == Bits || (SumC->getAPIntValue() == 0 && ShMsk1)) && ShAmt1Op1 == ShAmt0) - return DAG.getNode(Opc, DL, VT, Op0, Op1, - 
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1, ShAmt0); } } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) - return DAG.getNode(Opc, DL, VT, - N0.getOperand(0), N1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, - MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1, ShAmt0); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) { - unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); + unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) ShAmt1Op0 = ShAmt1Op0.getOperand(0); @@ -36823,15 +39024,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa<ConstantSDNode>(Op1.getOperand(1)) && - Op1.getConstantOperandVal(1) == 1) { - return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + Op1.getConstantOperandAPInt(1) == 1) { + return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { - return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } } } @@ -36873,7 +39072,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // Make sure the shift amount extracts the sign bit. if (!isa<ConstantSDNode>(Shift.getOperand(1)) || - Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. @@ -36926,13 +39125,10 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return SDValue(); // The shift should be smearing the sign bit across each vector element. - auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1)); - if (!ShiftBV) - return SDValue(); - - EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); - auto *ShiftAmt = ShiftBV->getConstantSplatNode(); - if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + auto *ShiftAmt = + isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true); + if (!ShiftAmt || + ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious @@ -37214,15 +39410,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, AVGBuilder); } - if (Operands[0].getOpcode() == ISD::ADD) + // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). + // Match the or case only if its 'add-like' - can be replaced by an add. 
+ auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) { + if (ISD::ADD == V.getOpcode()) { + Op0 = V.getOperand(0); + Op1 = V.getOperand(1); + return true; + } + if (ISD::ZERO_EXTEND != V.getOpcode()) + return false; + V = V.getOperand(0); + if (V.getValueType() != VT || ISD::OR != V.getOpcode() || + !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1))) + return false; + Op0 = V.getOperand(0); + Op1 = V.getOperand(1); + return true; + }; + + SDValue Op0, Op1; + if (FindAddLike(Operands[0], Op0, Op1)) std::swap(Operands[0], Operands[1]); - else if (Operands[1].getOpcode() != ISD::ADD) + else if (!FindAddLike(Operands[1], Op0, Op1)) return SDValue(); - Operands[2] = Operands[1].getOperand(0); - Operands[1] = Operands[1].getOperand(1); + Operands[2] = Op0; + Operands[1] = Op1; // Now we have three operands of two additions. Check that one of them is a - // constant vector with ones, and the other two are promoted from i8/i16. + // constant vector with ones, and the other two can be promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; @@ -37230,14 +39446,16 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) - if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || - Operands[j].getOperand(0).getValueType() != VT) - return SDValue(); + if (Operands[j].getValueType() != VT) { + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + Operands[j] = Operands[j].getOperand(0); + } // The pattern is detected, emit X86ISD::AVG instruction(s). - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Operands[0].getOperand(0), - Operands[1].getOperand(0) }, AVGBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]}, + AVGBuilder); } return SDValue(); @@ -37257,38 +39475,51 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; - unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, - AddressSpace, Alignment, &Fast) && !Fast))) { + *Ld->getMemOperand(), &Fast) && + !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Ptr = Ld->getBasePtr(); - + unsigned HalfAlign = 16; + SDValue Ptr1 = Ld->getBasePtr(); + SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - NumElems/2); + NumElems / 2); SDValue Load1 = - DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); - - Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); - SDValue Load2 = - DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags()); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, + Ld->getPointerInfo().getWithOffset(HalfAlign), + MinAlign(Alignment, HalfAlign), + Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load1.getValue(1), - Load2.getValue(1)); + Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); return DCI.CombineTo(N, NewVec, TF, true); } + // Bool vector load - attempt to cast to an integer, as we have good + // (vXiY *ext(vXi1 bitcast(iX))) handling. + if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && + RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { + unsigned NumElts = RegVT.getVectorNumElements(); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + if (TLI.isTypeLegal(IntVT)) { + SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Alignment, + Ld->getMemOperand()->getFlags()); + SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); + return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); + } + } + return SDValue(); } @@ -37415,6 +39646,9 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, if (ML->getPassThru().isUndef()) return SDValue(); + if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode())) + return SDValue(); + // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), @@ -37445,7 +39679,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::SEXTLOAD) + if (Mld->getExtensionType() != ISD::EXTLOAD) return SDValue(); // Resolve extending loads. 
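
The 256-bit load split above (Load1/Load2 at byte offsets 0 and 16, re-joined by CONCAT_VECTORS) is the plain "two halves yield the same bytes" decomposition; a trivial standalone model in C++, not part of the patch:

#include <cassert>
#include <cstdint>
#include <cstring>

struct Bytes32 { uint8_t b[32]; };

static Bytes32 load32_split(const uint8_t *p) {
  Bytes32 v;
  std::memcpy(v.b, p, 16);            // Load1: bytes [0, 16)
  std::memcpy(v.b + 16, p + 16, 16);  // Load2: base pointer advanced by HalfAlign = 16
  return v;
}

int main() {
  uint8_t buf[32];
  for (int i = 0; i < 32; ++i) buf[i] = static_cast<uint8_t>(3 * i + 1);
  Bytes32 whole, split = load32_split(buf);
  std::memcpy(whole.b, buf, 32);                   // single 32-byte load
  assert(std::memcmp(whole.b, split.b, 32) == 0);  // same result either way
  return 0;
}
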
@@ -37515,8 +39749,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); - return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); + + SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + SlicedVec = DAG.getBitcast(VT, SlicedVec); + + return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -37554,6 +39800,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!Mst->isTruncatingStore()) { if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) return ScalarStore; @@ -37562,7 +39812,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); @@ -37572,20 +39821,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // pattern above, but that pattern will be different. It will either need to // match setcc more generally or match PCMPGTM later (in tablegen?). + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); + } + return SDValue(); } // Resolve truncating stores. unsigned NumElems = VT.getVectorNumElements(); - EVT StVT = Mst->getMemoryVT(); - SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. 
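
The SimplifyDemandedBits call above asks only for the sign bit of each mask lane because the variable-blend and masked-move forms this lowers to ignore the other bits; a standalone SSE4.1 sketch, not part of the patch, compile with -msse4.1, showing PBLENDVB reading only the MSB of each mask byte:

#include <smmintrin.h>  // SSE4.1: _mm_blendv_epi8
#include <cassert>
#include <cstring>

int main() {
  __m128i a = _mm_set1_epi8(0x11);
  __m128i b = _mm_set1_epi8(0x22);
  __m128i full = _mm_set1_epi8((char)0xFF);  // MSB set, low bits set too
  __m128i msb  = _mm_set1_epi8((char)0x80);  // only the MSB set
  __m128i r1 = _mm_blendv_epi8(a, b, full);
  __m128i r2 = _mm_blendv_epi8(a, b, msb);
  assert(std::memcmp(&r1, &r2, sizeof(r1)) == 0);  // identical: low mask bits are ignored
  return 0;
}
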
@@ -37655,11 +39909,13 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast<StoreSDNode>(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); + unsigned Alignment = St->getAlignment(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -37710,8 +39966,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal->ops().slice(32, 32)); Hi = combinevXi1ConstantToInteger(Hi, DAG); - unsigned Alignment = St->getAlignment(); - SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl); @@ -37735,30 +39989,48 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; - unsigned AddressSpace = St->getAddressSpace(); - unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AddressSpace, Alignment, &Fast) && + *St->getMemOperand(), &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); - SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); - SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); + return splitVectorStore(St, DAG); + } - SDValue Ptr0 = St->getBasePtr(); - SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); + // Split under-aligned vector non-temporal stores. + if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) { + // ZMM/YMM nt-stores - either it can be stored as a series of shorter + // vectors or the legalizer can scalarize it to use MOVNTI. + if (VT.is256BitVector() || VT.is512BitVector()) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + return splitVectorStore(St, DAG); + } - SDValue Ch0 = - DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), - Alignment, St->getMemOperand()->getFlags()); - SDValue Ch1 = - DAG.getStore(St->getChain(), dl, Value1, Ptr1, - St->getPointerInfo().getWithOffset(16), - MinAlign(Alignment, 16U), St->getMemOperand()->getFlags()); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64 + // to use MOVNTI. + if (VT.is128BitVector() && Subtarget.hasSSE2()) { + MVT NTVT = Subtarget.hasSSE4A() + ? MVT::v2f64 + : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32); + return scalarizeVectorStore(St, NTVT, DAG); + } + } + + // Try to optimize v16i16->v16i8 truncating stores when BWI is not + // supported, but avx512f is by extending to v16i32 and truncating. + if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && + St->getValue().getOpcode() == ISD::TRUNCATE && + St->getValue().getOperand(0).getValueType() == MVT::v16i16 && + TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && + !DCI.isBeforeLegalizeOps()) { + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); + return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), + MVT::v16i8, St->getMemOperand()); } // Optimize trunc store (of multiple scalars) to shuffle and store. 
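
A standalone illustration, not from the patch, of the fallback chosen above for under-aligned XMM non-temporal stores: when a single 16-byte NT store cannot be used, the value can be spilled and written with per-element MOVNTI (_mm_stream_si32), assuming the destination is at least 4-byte aligned (the helper name is ours):

#include <emmintrin.h>  // SSE2

static void nt_store16_unaligned(int *dst, __m128i v) {
  alignas(16) int lanes[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(lanes), v);  // spill the vector
  for (int i = 0; i < 4; ++i)
    _mm_stream_si32(dst + i, lanes[i]);                    // MOVNTI per i32 element
  _mm_sfence();  // order the weakly-ordered NT stores before later consumers
}

int main() {
  alignas(4) int buf[4] = {0, 0, 0, 0};
  nt_store16_unaligned(buf, _mm_setr_epi32(1, 2, 3, 4));
  return buf[3] == 4 ? 0 : 1;
}
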
@@ -37774,7 +40046,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue Val = detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, TLI)) @@ -37878,7 +40149,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if ((VT.isVector() || + if (((VT.isVector() && !VT.isFloatingPoint()) || (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa<LoadSDNode>(St->getValue()) && !cast<LoadSDNode>(St->getValue())->isVolatile() && @@ -37901,8 +40172,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = (Subtarget.is64Bit() && - (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); @@ -37976,7 +40246,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool IsCommutative) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -37990,51 +40262,83 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. - // At least one of the operands should be a vector shuffle. - if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && - RHS.getOpcode() != ISD::VECTOR_SHUFFLE) - return false; - MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); + unsigned NumElts = VT.getVectorNumElements(); + + // TODO - can we make a general helper method that does all of this for us? + auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1, + SmallVectorImpl<int> &ShuffleMask) { + if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (!Op.getOperand(0).isUndef()) + N0 = Op.getOperand(0); + if (!Op.getOperand(1).isUndef()) + N1 = Op.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); + ShuffleMask.append(Mask.begin(), Mask.end()); + return; + } + bool UseSubVector = false; + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op.getOperand(0).getValueType().is256BitVector() && + llvm::isNullConstant(Op.getOperand(1))) { + Op = Op.getOperand(0); + UseSubVector = true; + } + bool IsUnary; + SmallVector<SDValue, 2> SrcOps; + SmallVector<int, 16> SrcShuffleMask; + SDValue BC = peekThroughBitcasts(Op); + if (isTargetShuffle(BC.getOpcode()) && + getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false, + SrcOps, SrcShuffleMask, IsUnary)) { + if (!UseSubVector && SrcShuffleMask.size() == NumElts && + SrcOps.size() <= 2) { + N0 = SrcOps.size() > 0 ? 
SrcOps[0] : SDValue(); + N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue(); + ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end()); + } + if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) && + SrcOps.size() == 1) { + N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op)); + N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op)); + ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts); + ShuffleMask.append(Mask.begin(), Mask.end()); + } + } + }; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle, then pretend it is the identity shuffle: // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: A default initialized SDValue represents an UNDEF of type VT. - unsigned NumElts = VT.getVectorNumElements(); SDValue A, B; - SmallVector<int, 16> LMask(NumElts); - if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!LHS.getOperand(0).isUndef()) - A = LHS.getOperand(0); - if (!LHS.getOperand(1).isUndef()) - B = LHS.getOperand(1); - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); - llvm::copy(Mask, LMask.begin()); - } else { - A = LHS; - for (unsigned i = 0; i != NumElts; ++i) - LMask[i] = i; - } + SmallVector<int, 16> LMask; + GetShuffle(LHS, A, B, LMask); // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 16> RMask(NumElts); - if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!RHS.getOperand(0).isUndef()) - C = RHS.getOperand(0); - if (!RHS.getOperand(1).isUndef()) - D = RHS.getOperand(1); - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); - llvm::copy(Mask, RMask.begin()); - } else { + SmallVector<int, 16> RMask; + GetShuffle(RHS, C, D, RMask); + + // At least one of the operands should be a vector shuffle. + unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1); + if (NumShuffles == 0) + return false; + + if (LMask.empty()) { + A = LHS; + for (unsigned i = 0; i != NumElts; ++i) + LMask.push_back(i); + } + + if (RMask.empty()) { C = RHS; for (unsigned i = 0; i != NumElts; ++i) - RMask[i] = i; + RMask.push_back(i); } // If A and B occur in reverse order in RHS, then canonicalize by commuting @@ -38083,6 +40387,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + + if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + return false; + + LHS = DAG.getBitcast(VT, LHS); + RHS = DAG.getBitcast(VT, RHS); return true; } @@ -38099,8 +40409,7 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal add/sub from adds/subs of shuffles. 
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, IsFadd) && - shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget)) + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); return SDValue(); @@ -38116,7 +40425,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const SDLoc &DL) { assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); SDValue Src = N->getOperand(0); - unsigned Opcode = Src.getOpcode(); + unsigned SrcOpcode = Src.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); @@ -38144,7 +40453,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); - return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1); + return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1); }; // Don't combine if the operation has other uses. @@ -38159,13 +40468,13 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // In most cases its only worth pre-truncating if we're only facing the cost // of one truncation. // i.e. if one of the inputs will constant fold or the input is repeated. - switch (Opcode) { + switch (SrcOpcode) { case ISD::AND: case ISD::XOR: case ISD::OR: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegalOrPromote(Opcode, VT) && + if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; @@ -38174,14 +40483,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. - if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) && - !TLI.isOperationLegal(Opcode, SrcVT)) + if (SrcVT.getScalarType() == MVT::i64 && + TLI.isOperationLegal(SrcOpcode, VT) && + !TLI.isOperationLegal(SrcOpcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; case ISD::ADD: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegal(Opcode, VT) && + if (TLI.isOperationLegal(SrcOpcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; @@ -38191,7 +40501,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // truncatable to avoid interfering with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegal(Opcode, VT) && + if (TLI.isOperationLegal(SrcOpcode, VT) && (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1)))) return TruncateArithmetic(Op0, Op1); break; @@ -38202,36 +40512,19 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, } /// Truncate using ISD::AND mask and X86ISD::PACKUS. +/// e.g. 
trunc <8 x i32> X to <8 x i16> --> +/// MaskX = X & 0xffff (clear high bits to prevent saturation) +/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1) static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); - EVT InSVT = InVT.getVectorElementType(); EVT OutVT = N->getValueType(0); - EVT OutSVT = OutVT.getVectorElementType(); - - // Split a long vector into vectors of legal type and mask to unset all bits - // that won't appear in the result to prevent saturation. - // TODO - we should be doing this at the maximum legal size but this is - // causing regressions where we're concatenating back to max width just to - // perform the AND and then extracting back again..... - unsigned NumSubRegs = InVT.getSizeInBits() / 128; - unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); - EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); - SmallVector<SDValue, 8> SubVecs(NumSubRegs); - - APInt Mask = - APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits()); - SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT); - - for (unsigned i = 0; i < NumSubRegs; i++) { - SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, - DAG.getIntPtrConstant(i * NumSubRegElts, DL)); - SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal); - } - In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs); + APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT)); return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget); } @@ -38594,16 +40887,23 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); + unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); + SDValue Op = peekThroughBitcasts(SDValue(N, 0)); - auto VT = Op->getValueType(0); + EVT VT = Op->getValueType(0); + // Make sure the element size does't change. + if (VT.getScalarSizeInBits() != ScalarSize) + return SDValue(); + if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) { // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) - return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), - SVOp->getMask()); + if (NegOp0.getValueType() == VT) // FIXME: Can we do better? 
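The one-step mask-then-pack lowering described in the new combineVectorTruncationWithPACKUS comment above can be sketched with intrinsics. This is only an illustrative sketch, not code from the patch: the helper name trunc_v8i32_to_v8i16 is made up here, and it assumes AVX2 (256-bit integer AND/extract) plus SSE4.1 (PACKUSDW) are available.

    #include <immintrin.h>

    // trunc <8 x i32> -> <8 x i16>: clear the high half of every lane so the
    // unsigned saturation in PACKUSDW can never fire, then pack the two
    // 128-bit halves of the masked value back together.
    static __m128i trunc_v8i32_to_v8i16(__m256i x) {
      __m256i masked = _mm256_and_si256(x, _mm256_set1_epi32(0xFFFF));
      __m128i lo = _mm256_castsi256_si128(masked);
      __m128i hi = _mm256_extracti128_si256(masked, 1);
      return _mm_packus_epi32(lo, hi); // lanes 0..3 from lo, lanes 4..7 from hi
    }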
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), + SVOp->getMask()); return SDValue(); } unsigned Opc = Op.getOpcode(); @@ -38615,19 +40915,17 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { if (!InsVector.isUndef()) return SDValue(); if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) - return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, - NegInsVal, Op.getOperand(2)); + if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, + NegInsVal, Op.getOperand(2)); return SDValue(); } if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB) return SDValue(); - SDValue Op1 = peekThroughBitcasts(Op.getOperand(1)); - if (!Op1.getValueType().isFloatingPoint()) - return SDValue(); - - SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); + SDValue Op1 = Op.getOperand(1); + SDValue Op0 = Op.getOperand(0); // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit // masks. For FSUB, we have to check if constant bits of Op0 are sign bit @@ -38639,7 +40937,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { SmallVector<APInt, 16> EltBits; // Extract constant bits and see if they are all sign bit masks. Ignore the // undef elements. - if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(), + if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) { @@ -38936,13 +41234,12 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, if (Subtarget.useSoftFloat()) return SDValue(); - // TODO: If an operand is already known to be a NaN or not a NaN, this - // should be an optional swap and FMAX/FMIN. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); - if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || - (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + if (!((Subtarget.hasSSE1() && VT == MVT::f32) || + (Subtarget.hasSSE2() && VT == MVT::f64) || + (VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); SDValue Op0 = N->getOperand(0); @@ -38955,13 +41252,20 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs()) return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); + // If one of the operands is known non-NaN use the native min/max instructions + // with the non-NaN input as second operand. + if (DAG.isKnownNeverNaN(Op1)) + return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); + if (DAG.isKnownNeverNaN(Op0)) + return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags()); + // If we have to respect NaN inputs, this takes at least 3 instructions. // Favor a library call when operating on a scalar and minimizing code size. 
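The combineFMinNumFMaxNum change above leans on the asymmetric NaN handling of the SSE MIN/MAX instructions: when one input is a NaN, the second source operand is returned, so an FMINNUM/FMAXNUM with one operand known non-NaN can use the plain instruction as long as that operand goes second. A small standalone sketch of that behaviour (not from the patch; plain SSE on an x86 target):

    #include <xmmintrin.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      __m128 qnan = _mm_set_ss(std::nanf(""));
      __m128 one  = _mm_set_ss(1.0f);
      // MINSS returns its second source operand when either input is a NaN,
      // so keeping the known-non-NaN value second yields the minnum result
      // without any extra compare/blend.
      std::printf("min(NaN, 1.0) = %f\n", _mm_cvtss_f32(_mm_min_ss(qnan, one))); // 1.000000
      std::printf("min(1.0, NaN) = %f\n", _mm_cvtss_f32(_mm_min_ss(one, qnan))); // nan
      return 0;
    }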
- if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize()) + if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); - EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( - DAG.getDataLayout(), *DAG.getContext(), VT); + EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + VT); // There are 4 possibilities involving NaN inputs, and these are the required // outputs: @@ -39001,6 +41305,69 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, KnownZero, DCI)) return SDValue(N, 0); + // Convert a full vector load into vzload when not all bits are needed. + SDValue In = N->getOperand(0); + MVT InVT = In.getSimpleValueType(); + if (VT.getVectorNumElements() < InVT.getVectorNumElements() && + ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { + assert(InVT.is128BitVector() && "Expected 128-bit input vector"); + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + // Unless the load is volatile. + if (!LN->isVolatile()) { + SDLoc dl(N); + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getIntegerVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, + LN->getPointerInfo(), + LN->getAlignment(), + LN->getMemOperand()->getFlags()); + SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, + DAG.getBitcast(InVT, VZLoad)); + DCI.CombineTo(N, Convert); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return SDValue(N, 0); + } + } + + return SDValue(); +} + +static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // Convert a full vector load into vzload when not all bits are needed. + SDValue In = N->getOperand(0); + MVT InVT = In.getSimpleValueType(); + if (VT.getVectorNumElements() < InVT.getVectorNumElements() && + ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { + assert(InVT.is128BitVector() && "Expected 128-bit input vector"); + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + // Unless the load is volatile. + if (!LN->isVolatile()) { + SDLoc dl(N); + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getFloatingPointVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, + LN->getPointerInfo(), + LN->getAlignment(), + LN->getMemOperand()->getFlags()); + SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, + DAG.getBitcast(InVT, VZLoad)); + DCI.CombineTo(N, Convert); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + return SDValue(N, 0); + } + } + return SDValue(); } @@ -39019,18 +41386,14 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, return DAG.getConstant(0, SDLoc(N), VT); // Turn ANDNP back to AND if input is inverted. 
- if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) { - return DAG.getNode(ISD::AND, SDLoc(N), VT, - N->getOperand(0).getOperand(0), N->getOperand(1)); - } + if (SDValue Not = IsNOT(N->getOperand(0), DAG)) + return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), + N->getOperand(1)); // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively( - {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } @@ -39053,18 +41416,24 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, // Try to combine sext_in_reg of a cmov of constants by extending the constants. static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + + EVT DstVT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); - if (ExtraVT != MVT::i16) + if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16) return SDValue(); - // Look through single use any_extends. - if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse()) + // Look through single use any_extends / truncs. + SDValue IntermediateBitwidthOp; + if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) && + N0.hasOneUse()) { + IntermediateBitwidthOp = N0; N0 = N0.getOperand(0); + } // See if we have a single use cmov. if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse()) @@ -39080,21 +41449,37 @@ static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - // If we looked through an any_extend above, add one to the constants. - if (N0.getValueType() != VT) { - CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0); - CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1); + // If we looked through an any_extend/trunc above, add one to the constants. + if (IntermediateBitwidthOp) { + unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode(); + CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0); + CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1); + } + + CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1); + CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1); + + EVT CMovVT = DstVT; + // We do not want i16 CMOV's. Promote to i32 and truncate afterwards. 
+ if (DstVT == MVT::i16) { + CMovVT = MVT::i32; + CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0); + CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1); } - CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1); - CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1); + SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1, + N0.getOperand(2), N0.getOperand(3)); + + if (CMovVT != DstVT) + CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov); - return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1, - N0.getOperand(2), N0.getOperand(3)); + return CMov; } static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + if (SDValue V = combineSextInRegCmov(N, DAG)) return V; @@ -39350,6 +41735,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, return SDValue(); unsigned Opcode = N->getOpcode(); + // TODO - add ANY_EXTEND support. if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) @@ -39396,13 +41782,13 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { - EVT InVT = N.getValueType(); - EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), - Size / InVT.getScalarSizeInBits()); - SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), - DAG.getUNDEF(InVT)); + EVT SrcVT = N.getValueType(); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), + Size / SrcVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(), + DAG.getUNDEF(SrcVT)); Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend @@ -39424,8 +41810,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, (VT.is256BitVector() && Subtarget.hasAVX()) || (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); - Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, ExOp); } @@ -39435,9 +41820,7 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG; - + unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); SmallVector<SDValue, 8> Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, @@ -39471,7 +41854,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Only do this combine with AVX512 for vector extends. - if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC) + if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC) return SDValue(); // Only combine legal element types. 
@@ -39487,7 +41870,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since // that's the only integer compares with we have. - ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get(); + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); if (ISD::isUnsignedIntSetCC(CC)) return SDValue(); @@ -39643,6 +42026,10 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, if (!NegVal) return SDValue(); + // FIXME: Should we bitcast instead? + if (NegVal.getValueType() != VT) + return SDValue(); + unsigned NewOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -39719,6 +42106,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) return R; + // TODO: Combine with any target/faux shuffle. + if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 && + VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); + unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); + APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); + if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && + (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { + return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); + } + } + return SDValue(); } @@ -39748,9 +42149,14 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); - // Bail out if we know that this is not really just an oversized integer. - if (peekThroughBitcasts(X).getValueType() == MVT::f128 || - peekThroughBitcasts(Y).getValueType() == MVT::f128) + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa<ConstantSDNode>(X) || X.getValueType().isVector() || + X.getOpcode() == ISD::LOAD; + }; + if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && + !IsOrXorXorCCZero) return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? @@ -39887,66 +42293,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); + unsigned NumBits = VT.getScalarSizeInBits(); + unsigned NumElts = SrcVT.getVectorNumElements(); // Perform constant folding. if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { - assert(VT== MVT::i32 && "Unexpected result type"); + assert(VT == MVT::i32 && "Unexpected result type"); APInt Imm(32, 0); for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { - SDValue In = Src.getOperand(Idx); - if (!In.isUndef() && - cast<ConstantSDNode>(In)->getAPIntValue().isNegative()) + if (!Src.getOperand(Idx).isUndef() && + Src.getConstantOperandAPInt(Idx).isNegative()) Imm.setBit(Idx); } return DAG.getConstant(Imm, SDLoc(N), VT); } // Look through int->fp bitcasts that don't change the element width. 
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() && - SrcVT.isFloatingPoint() && - Src.getOperand(0).getValueType() == - EVT(SrcVT).changeVectorElementTypeToInteger()) - Src = Src.getOperand(0); + unsigned EltWidth = SrcVT.getScalarSizeInBits(); + if (Src.getOpcode() == ISD::BITCAST && + Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) + return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); + + // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results + // with scalar comparisons. + if (SDValue NotSrc = IsNOT(Src, DAG)) { + SDLoc DL(N); + APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); + NotSrc = DAG.getBitcast(SrcVT, NotSrc); + return DAG.getNode(ISD::XOR, DL, VT, + DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc), + DAG.getConstant(NotMask, DL, VT)); + } // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + APInt DemandedMask(APInt::getAllOnesValue(NumBits)); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); - // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)). - // Only do this when the setcc input and output types are the same and the - // setcc and the 'and' node have a single use. - // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't. - APInt SplatVal; - if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && - Src.getOperand(0).getValueType() == Src.getValueType() && - cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETNE && - ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && - Src.getOperand(0).getOpcode() == ISD::AND) { - SDValue And = Src.getOperand(0); - if (And.hasOneUse() && - ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) && - SplatVal.isPowerOf2()) { - MVT VT = Src.getSimpleValueType(); - unsigned BitWidth = VT.getScalarSizeInBits(); - unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1; - SDLoc DL(And); - SDValue X = And.getOperand(0); - // If the element type is i8, we need to bitcast to i16 to use a legal - // shift. If we wait until lowering we end up with an extra and to bits - // from crossing the 8-bit elements, but we don't care about that here. - if (VT.getVectorElementType() == MVT::i8) { - VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); - X = DAG.getBitcast(VT, X); - } - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(ShAmt, DL, VT)); - SDValue Cast = DAG.getBitcast(SrcVT, Shl); - return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast); - } - } - return SDValue(); } @@ -40079,8 +42463,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. - if (BuildVectorSDNode *BV = - dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); @@ -40102,6 +42485,41 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, return SDValue(); } +/// If we are converting a value to floating-point, try to replace scalar +/// truncate of an extracted vector element with a bitcast. 
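The movmsk(not(x)) -> not(movmsk(x)) fold added in combineMOVMSK above is just the observation that a vector NOT flips every sign bit, so the extracted mask is the bitwise complement restricted to the lane count (which is what the getLowBitsSet XOR mask models). A throwaway SSE2 check of that identity, written here only for illustration:

    #include <emmintrin.h>
    #include <cassert>

    int main() {
      __m128i x = _mm_set_epi8(-1, 2, -3, 4, -5, 6, -7, 8,
                               9, -10, 11, -12, 13, -14, 15, -16);
      __m128i not_x = _mm_xor_si128(x, _mm_set1_epi8(-1));
      int lhs = _mm_movemask_epi8(not_x);         // movmsk(not(x))
      int rhs = (~_mm_movemask_epi8(x)) & 0xFFFF; // not(movmsk(x)), 16 lanes
      assert(lhs == rhs);
      return 0;
    }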
This tries to keep +/// the sequence on XMM registers rather than moving between vector and GPRs. +static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) { + // TODO: This is currently only used by combineSIntToFP, but it is generalized + // to allow being called by any similar cast opcode. + // TODO: Consider merging this into lowering: vectorizeExtractedCast(). + SDValue Trunc = N->getOperand(0); + if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue ExtElt = Trunc.getOperand(0); + if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isNullConstant(ExtElt.getOperand(1))) + return SDValue(); + + EVT TruncVT = Trunc.getValueType(); + EVT SrcVT = ExtElt.getValueType(); + unsigned DestWidth = TruncVT.getSizeInBits(); + unsigned SrcWidth = SrcVT.getSizeInBits(); + if (SrcWidth % DestWidth != 0) + return SDValue(); + + // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0) + EVT SrcVecVT = ExtElt.getOperand(0).getValueType(); + unsigned VecWidth = SrcVecVT.getSizeInBits(); + unsigned NumElts = VecWidth / DestWidth; + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts); + SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0)); + SDLoc DL(N); + SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT, + BitcastVec, ExtElt.getOperand(1)); + return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt); +} + static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); @@ -40195,6 +42613,10 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, return FILDChain; } } + + if (SDValue V = combineToFPTruncExtElt(N, DAG)) + return V; + return SDValue(); } @@ -40281,13 +42703,13 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) && onlyZeroFlagUsed(SDValue(N, 0))) { - EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); - unsigned ShAmt = Op.getConstantOperandVal(1); - if (ShAmt < BitWidth) { // Avoid undefined shifts. + const APInt &ShAmt = Op.getConstantOperandAPInt(1); + if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts. + unsigned MaskBits = BitWidth - ShAmt.getZExtValue(); APInt Mask = Op.getOpcode() == ISD::SRL - ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) - : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + ? APInt::getHighBitsSet(BitWidth, MaskBits) + : APInt::getLowBitsSet(BitWidth, MaskBits); if (Mask.isSignedIntN(32)) { Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -40297,7 +42719,6 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { } } - // Look for a truncate with a single use. if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse()) return SDValue(); @@ -40351,8 +42772,42 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { return Op.getValue(1); } +static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) && + "Expected X86ISD::ADD or X86ISD::SUB"); + + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + MVT VT = LHS.getSimpleValueType(); + unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB; + + // If we don't use the flag result, simplify back to a generic ADD/SUB. 
+ if (!N->hasAnyUseOfValue(1)) { + SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS); + return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL); + } + + // Fold any similar generic ADD/SUB opcodes to reuse this node. + auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { + SDValue Ops[] = {N0, N1}; + SDVTList VTs = DAG.getVTList(N->getValueType(0)); + if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { + SDValue Op(N, 0); + if (Negate) + Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); + DCI.CombineTo(GenericAddSub, Op); + } + }; + MatchGeneric(LHS, RHS, false); + MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); + + return SDValue(); +} + static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { + if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, @@ -40360,6 +42815,15 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { Flags); } + // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) + // iff the flag result is dead. + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) && + !N->hasAnyUseOfValue(1)) + return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0), + Op0.getOperand(1), N->getOperand(2)); + return SDValue(); } @@ -40386,7 +42850,7 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, Res1, CarryOut); } - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { + if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, @@ -40482,7 +42946,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && EFLAGS.getValueType().isInteger() && !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), @@ -40589,8 +43053,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops); + MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; auto BuildPMADDWD = [&](SDValue Mul) { @@ -40645,10 +43109,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, return SDValue(); // We know N is a reduction add, which means one of its operands is a phi. - // To match SAD, we need the other operand to be a vector select. - if (Op0.getOpcode() != ISD::VSELECT) + // To match SAD, we need the other operand to be a ABS. 
+ if (Op0.getOpcode() != ISD::ABS) std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::VSELECT) + if (Op0.getOpcode() != ISD::ABS) return SDValue(); auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { @@ -40687,7 +43151,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, Op0 = BuildPSADBW(SadOp0, SadOp1); // It's possible we have a sad on the other side too. - if (Op1.getOpcode() == ISD::VSELECT && + if (Op1.getOpcode() == ISD::ABS && detectZextAbsDiff(Op1, SadOp0, SadOp1)) { Op1 = BuildPSADBW(SadOp0, SadOp1); } @@ -40829,39 +43293,6 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, PMADDBuilder); } -// Try to turn (add (umax X, C), -C) into (psubus X, C) -static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - EVT VT = N->getValueType(0); - - // psubus is available in SSE2 for i8 and i16 vectors. - if (!VT.isVector() || VT.getVectorNumElements() < 2 || - !isPowerOf2_32(VT.getVectorNumElements()) || - !(VT.getVectorElementType() == MVT::i8 || - VT.getVectorElementType() == MVT::i16)) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - if (Op0.getOpcode() != ISD::UMAX) - return SDValue(); - - // The add should have a constant that is the negative of the max. - // TODO: Handle build_vectors with undef elements. - auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { - return Max->getAPIntValue() == (-Op->getAPIntValue()); - }; - if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT)) - return SDValue(); - - SDLoc DL(N); - return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0), - Op0.getOperand(1)); -} - // Attempt to turn this pattern into PMADDWD. // (mul (add (zext (build_vector)), (zext (build_vector))), // (add (zext (build_vector)), (zext (build_vector))) @@ -40971,12 +43402,12 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, ArrayRef<SDValue> Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. - EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i16 && + EVT OpVT = Ops[0].getValueType(); + assert(OpVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"); - assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); + assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - InVT.getVectorNumElements() / 2); + OpVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, @@ -41004,8 +43435,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. 
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) && - shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) { + Subtarget.hasSSSE3() && + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); @@ -41017,9 +43448,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineIncDecVector(N, DAG)) return V; - if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget)) - return V; - return combineAddOrSubToADCOrSBB(N, DAG); } @@ -41124,7 +43552,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa<ConstantSDNode>(Op1.getOperand(1))) { - APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); + const APInt &XorC = Op1.getConstantOperandAPInt(1); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), @@ -41138,8 +43566,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) && - shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) { + Subtarget.hasSSSE3() && + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); @@ -41173,6 +43601,149 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Helper that combines an array of subvector ops as if they were the operands +/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g. +/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type. +static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, + ArrayRef<SDValue> Ops, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors"); + + if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) + return DAG.getUNDEF(VT); + + if (llvm::all_of(Ops, [](SDValue Op) { + return ISD::isBuildVectorAllZeros(Op.getNode()); + })) + return getZeroVector(VT, Subtarget, DAG, DL); + + SDValue Op0 = Ops[0]; + + // Fold subvector loads into one. + // If needed, look through bitcasts to get to the load. + if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) { + bool Fast; + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + *FirstLd->getMemOperand(), &Fast) && + Fast) { + if (SDValue Ld = + EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) + return Ld; + } + } + + // Repeated subvectors. + if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) { + // If this broadcast/subv_broadcast is inserted into both halves, use a + // larger broadcast/subv_broadcast. 
+ if (Op0.getOpcode() == X86ISD::VBROADCAST || + Op0.getOpcode() == X86ISD::SUBV_BROADCAST) + return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); + + // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) + if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && + (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0)))) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, + Op0.getOperand(0), + DAG.getIntPtrConstant(0, DL))); + + // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) + if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && + (Subtarget.hasAVX2() || + (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) && + Op0.getOperand(0).getValueType() == VT.getScalarType()) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0)); + } + + bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); + + // Repeated opcode. + // TODO - combineX86ShufflesRecursively should handle shuffle concatenation + // but it currently struggles with different vector widths. + if (llvm::all_of(Ops, [Op0](SDValue Op) { + return Op.getOpcode() == Op0.getOpcode(); + })) { + unsigned NumOps = Ops.size(); + switch (Op0.getOpcode()) { + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFD: + if (!IsSplat && NumOps == 2 && VT.is256BitVector() && + Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(Ops[i].getOperand(0)); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), + Op0.getOperand(1)); + } + LLVM_FALLTHROUGH; + case X86ISD::VPERMILPI: + // TODO - add support for vXf64/vXi64 shuffles. + if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) && + Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0))); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src); + Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res, + Op0.getOperand(1)); + return DAG.getBitcast(VT, Res); + } + break; + case X86ISD::PACKUS: + if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumOps * SrcVT.getVectorNumElements()); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); + } + break; + } + } + + // If we're inserting all zeros into the upper half, change this to + // an insert into an all zeros vector. We will match this to a move + // with implicit upper bit zeroing during isel. 
+ if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), Ops[0], + DAG.getIntPtrConstant(0, DL)); + + return SDValue(); +} + +static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + EVT SrcVT = N->getOperand(0).getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Don't do anything for i1 vectors. + if (VT.getVectorElementType() == MVT::i1) + return SDValue(); + + if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) { + SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); + if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG, + DCI, Subtarget)) + return R; + } + + return SDValue(); +} + static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -41187,19 +43758,23 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); - unsigned IdxVal = N->getConstantOperandVal(2); + uint64_t IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); - if (ISD::isBuildVectorAllZeros(Vec.getNode())) { - // Inserting zeros into zeros is a nop. - if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - return getZeroVector(OpVT, Subtarget, DAG, dl); + if (Vec.isUndef() && SubVec.isUndef()) + return DAG.getUNDEF(OpVT); + // Inserting undefs/zeros into zeros/undefs is a zero vector. + if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) && + (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode()))) + return getZeroVector(OpVT, Subtarget, DAG, dl); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { - unsigned Idx2Val = SubVec.getConstantOperandVal(2); + uint64_t Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), @@ -41211,30 +43786,16 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // least as large as the original insertion. Just insert the original // subvector into a zero vector. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && - SubVec.getConstantOperandVal(1) == 0 && + SubVec.getConstantOperandAPInt(1) == 0 && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); - if (Ins.getConstantOperandVal(2) == 0 && + if (Ins.getConstantOperandAPInt(2) == 0 && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); } - - // If we're inserting a bitcast into zeros, rewrite the insert and move the - // bitcast to the other side. This helps with detecting zero extending - // during isel. - // TODO: Is this useful for other indices than 0? 
- if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { - MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); - unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); - MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); - SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, - DAG.getBitcast(NewVT, Vec), - SubVec.getOperand(0), N->getOperand(2)); - return DAG.getBitcast(OpVT, Insert); - } } // Stop here if this is an i1 vector. @@ -41262,77 +43823,92 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, } } - // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte - // load: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr + 16), Elts/2) - // --> load32 addr - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr + 32), Elts/2) - // --> load64 addr - // or a 16-byte or 32-byte broadcast: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr), Elts/2) - // --> X86SubVBroadcast(load16 addr) - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr), Elts/2) - // --> X86SubVBroadcast(load32 addr) + // Match concat_vector style patterns. + SmallVector<SDValue, 2> SubVectorOps; + if (collectConcatOps(N, SubVectorOps)) + if (SDValue Fold = + combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) + return Fold; + + // If we are inserting into both halves of the vector, the starting vector + // should be undef. If it isn't, make it so. Only do this if the early insert + // has no other uses. + // TODO: Should this be a generic DAG combine? + // TODO: Why doesn't SimplifyDemandedVectorElts catch this? if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - if (isNullConstant(Vec.getOperand(2))) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { - bool Fast; - unsigned Alignment = FirstLd->getAlignment(); - unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = {SubVec2, SubVec}; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, - Subtarget, false)) - return Ld; - } - } - // If lower/upper loads are the same and there's no other use of the lower - // load, then splat the loaded value with a broadcast. - if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) - if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse()) - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - - // If this is subv_broadcast insert into both halves, use a larger - // subv_broadcast. - if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, - SubVec.getOperand(0)); - - // If we're inserting all zeros into the upper half, change this to - // an insert into an all zeros vector. We will match this to a move - // with implicit upper bit zeroing during isel. 
- if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, - getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2, - Vec.getOperand(2)); + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && + isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && + Vec.hasOneUse()) { + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), + Vec.getOperand(1), Vec.getOperand(2)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, + N->getOperand(2)); + } - // If we are inserting into both halves of the vector, the starting - // vector should be undef. If it isn't, make it so. Only do this if the - // the early insert has no other uses. - // TODO: Should this be a generic DAG combine? - if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), - SubVec2, Vec.getOperand(2)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, - N->getOperand(2)); + // If this is a broadcast insert into an upper undef, use a larger broadcast. + if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) + return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); - } - } + return SDValue(); +} + +/// If we are extracting a subvector of a vector select and the select condition +/// is composed of concatenated vectors, try to narrow the select width. This +/// is a common pattern for AVX1 integer code because 256-bit selects may be +/// legal, but there is almost no integer math/logic available for 256-bit. +/// This function should only be called with legal types (otherwise, the calls +/// to get simple value types will assert). +static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { + SDValue Sel = peekThroughBitcasts(Ext->getOperand(0)); + SmallVector<SDValue, 4> CatOps; + if (Sel.getOpcode() != ISD::VSELECT || + !collectConcatOps(Sel.getOperand(0).getNode(), CatOps)) + return SDValue(); + + // Note: We assume simple value types because this should only be called with + // legal operations/types. + // TODO: This can be extended to handle extraction to 256-bits. + MVT VT = Ext->getSimpleValueType(0); + if (!VT.is128BitVector()) + return SDValue(); + + MVT SelCondVT = Sel.getOperand(0).getSimpleValueType(); + if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector()) + return SDValue(); + + MVT WideVT = Ext->getOperand(0).getSimpleValueType(); + MVT SelVT = Sel.getSimpleValueType(); + assert((SelVT.is256BitVector() || SelVT.is512BitVector()) && + "Unexpected vector type with legal operations"); + + unsigned SelElts = SelVT.getVectorNumElements(); + unsigned CastedElts = WideVT.getVectorNumElements(); + unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue(); + if (SelElts % CastedElts == 0) { + // The select has the same or more (narrower) elements than the extract + // operand. The extraction index gets scaled by that factor. + ExtIdx *= (SelElts / CastedElts); + } else if (CastedElts % SelElts == 0) { + // The select has less (wider) elements than the extract operand. Make sure + // that the extraction index can be divided evenly. 
+ unsigned IndexDivisor = CastedElts / SelElts; + if (ExtIdx % IndexDivisor != 0) + return SDValue(); + ExtIdx /= IndexDivisor; + } else { + llvm_unreachable("Element count of simple vector types are not divisible?"); } - return SDValue(); + unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits(); + unsigned NarrowElts = SelElts / NarrowingFactor; + MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts); + SDLoc DL(Ext); + SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL); + SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL); + SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL); + SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF); + return DAG.getBitcast(VT, NarrowSel); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, @@ -41348,7 +43924,10 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Capture the original wide type in the likely case that we need to bitcast // back to this type. - EVT VT = N->getValueType(0); + if (!N->getValueType(0).isSimple()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); EVT WideVecVT = N->getOperand(0).getValueType(); SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41374,65 +43953,102 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - MVT OpVT = N->getSimpleValueType(0); + if (SDValue V = narrowExtractedVectorSelect(N, DAG)) + return V; + SDValue InVec = N->getOperand(0); unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) - return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); + return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { - if (OpVT.getScalarType() == MVT::i1) - return DAG.getConstant(1, SDLoc(N), OpVT); - return getOnesVector(OpVT, DAG, SDLoc(N)); + if (VT.getScalarType() == MVT::i1) + return DAG.getConstant(1, SDLoc(N), VT); + return getOnesVector(VT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( - OpVT, SDLoc(N), - InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); + VT, SDLoc(N), + InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); + + // Try to move vector bitcast after extract_subv by scaling extraction index: + // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') + // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR + if (InVec.getOpcode() == ISD::BITCAST && + InVec.getOperand(0).getValueType().isVector()) { + SDValue SrcOp = InVec.getOperand(0); + EVT SrcVT = SrcOp.getValueType(); + unsigned SrcNumElts = SrcVT.getVectorNumElements(); + unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = 
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + SrcOp, NewIndex); + return DAG.getBitcast(VT, NewExtract); + } + } + } + } + + // If we're extracting from a broadcast then we're better off just + // broadcasting to the smaller type directly, assuming this is the only use. + // As its a broadcast we don't care about the extraction index. + if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() && + InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { - return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0)); + return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0)); + } + // v2f64 CVTUDQ2PD(v4i32). + if (InOpcode == ISD::UINT_TO_FP && + InVec.getOperand(0).getValueType() == MVT::v4i32) { + return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0)); } // v2f64 CVTPS2PD(v4f32). if (InOpcode == ISD::FP_EXTEND && InVec.getOperand(0).getValueType() == MVT::v4f32) { - return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0)); + return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } - if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) && - OpVT.is128BitVector() && - InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - unsigned ExtOp = - InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG; - return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); - } - if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + if ((InOpcode == ISD::ANY_EXTEND || + InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || + InOpcode == ISD::ZERO_EXTEND || + InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && - OpVT.is128BitVector() && + VT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0)); + unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); + return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0)); } - if (InOpcode == ISD::BITCAST) { - // TODO - do this for target shuffles in general. 
- SDValue InVecBC = peekThroughOneUseBitcasts(InVec); - if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) { - SDLoc DL(N); - SDValue SubPSHUFB = - DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, - extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL), - extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL)); - return DAG.getBitcast(OpVT, SubPSHUFB); - } + if (InOpcode == ISD::VSELECT && + InVec.getOperand(0).getValueType().is256BitVector() && + InVec.getOperand(1).getValueType().is256BitVector() && + InVec.getOperand(2).getValueType().is256BitVector()) { + SDLoc DL(N); + SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128); + SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128); + SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); + return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } } @@ -41442,6 +44058,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); + SDLoc DL(N); // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. // This occurs frequently in our masked scalar intrinsic code and our @@ -41450,7 +44067,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) if (C->getAPIntValue().isOneValue()) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, + return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0)); // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. @@ -41459,8 +44076,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) if (C->isNullValue()) - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, - Src.getOperand(0), Src.getOperand(1)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0), + Src.getOperand(1)); + + // Reduce v2i64 to v4i32 if we don't need the upper bits. + // TODO: Move to DAGCombine? + if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND && + Src.getValueType() == MVT::i64 && Src.hasOneUse() && + Src.getOperand(0).getScalarValueSizeInBits() <= 32) + return DAG.getBitcast( + VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, + DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32))); return SDValue(); } @@ -41497,6 +44123,56 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Try to merge vector loads and extend_inreg to an extload. + if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && + In.hasOneUse()) { + auto *Ld = cast<LoadSDNode>(In); + if (!Ld->isVolatile()) { + MVT SVT = In.getSimpleValueType().getVectorElementType(); + ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, + VT.getVectorNumElements()); + if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { + SDValue Load = + DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + return Load; + } + } + } + + // Disabling for widening legalization for now. We can enable if we find a + // case that needs it. Otherwise it can be deleted when we switch to + // widening legalization. + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + // Combine (ext_invec (ext_invec X)) -> (ext_invec X) + if (In.getOpcode() == N->getOpcode() && + TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); + + // Attempt to combine as a shuffle. + // TODO: SSE41 support + if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { + SDValue Op(N, 0); + if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -41508,6 +44184,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); + case ISD::CONCAT_VECTORS: + return combineConcatVectors(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: @@ -41520,19 +44198,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::CMP: return combineCMP(N, DAG); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); + case X86ISD::ADD: + case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); + case ISD::SHL: return combineShiftLeft(N, DAG); + case ISD::SRA: return combineShiftRightArithmetic(N, DAG); + case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); - case ISD::STORE: return combineStore(N, DAG, Subtarget); + case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); @@ -41549,13 +44229,21 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); - case X86ISD::CVTSI2P: + case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); + case X86ISD::CVTP2SI: + case X86ISD::CVTP2UI: + 
case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI, + Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); @@ -41638,11 +44326,15 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) return false; - // 8-bit multiply is probably not much cheaper than 32-bit multiply, and - // we have specializations to turn 32-bit multiply into LEA or other ops. + // TODO: Almost no 8-bit ops are desirable because they have no actual + // size/speed advantages vs. 32-bit ops, but they do have a major + // potential disadvantage by causing partial register stalls. + // + // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and + // we have specializations to turn 32-bit multiply/shl into LEA or other ops. // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally // check for a constant operand to the multiply. - if (Opc == ISD::MUL && VT == MVT::i8) + if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8) return false; // i16 instruction encodings are longer and some i16 instructions are slow, @@ -41656,6 +44348,7 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: + case ISD::SRA: case ISD::SRL: case ISD::SUB: case ISD::ADD: @@ -41731,6 +44424,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { case ISD::ANY_EXTEND: break; case ISD::SHL: + case ISD::SRA: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). @@ -41903,6 +44597,40 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { return false; } +static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { + X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint) + .Case("{@cca}", X86::COND_A) + .Case("{@ccae}", X86::COND_AE) + .Case("{@ccb}", X86::COND_B) + .Case("{@ccbe}", X86::COND_BE) + .Case("{@ccc}", X86::COND_B) + .Case("{@cce}", X86::COND_E) + .Case("{@ccz}", X86::COND_E) + .Case("{@ccg}", X86::COND_G) + .Case("{@ccge}", X86::COND_GE) + .Case("{@ccl}", X86::COND_L) + .Case("{@ccle}", X86::COND_LE) + .Case("{@ccna}", X86::COND_BE) + .Case("{@ccnae}", X86::COND_B) + .Case("{@ccnb}", X86::COND_AE) + .Case("{@ccnbe}", X86::COND_A) + .Case("{@ccnc}", X86::COND_AE) + .Case("{@ccne}", X86::COND_NE) + .Case("{@ccnz}", X86::COND_NE) + .Case("{@ccng}", X86::COND_LE) + .Case("{@ccnge}", X86::COND_L) + .Case("{@ccnl}", X86::COND_GE) + .Case("{@ccnle}", X86::COND_G) + .Case("{@ccno}", X86::COND_NO) + .Case("{@ccnp}", X86::COND_P) + .Case("{@ccns}", X86::COND_NS) + .Case("{@cco}", X86::COND_O) + .Case("{@ccp}", X86::COND_P) + .Case("{@ccs}", X86::COND_S) + .Default(X86::COND_INVALID); + return Cond; +} + /// Given a constraint letter, return the type of constraint for this target. 
X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { @@ -41963,7 +44691,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { return C_RegisterClass; } } - } + } else if (parseConstraintCode(Constraint) != X86::COND_INVALID) + return C_Other; return TargetLowering::getConstraintType(Constraint); } @@ -42134,6 +44863,32 @@ LowerXConstraint(EVT ConstraintVT) const { return TargetLowering::LowerXConstraint(ConstraintVT); } +// Lower @cc targets via setcc. +SDValue X86TargetLowering::LowerAsmOutputForConstraint( + SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo, + SelectionDAG &DAG) const { + X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); + if (Cond == X86::COND_INVALID) + return SDValue(); + // Check that return type is valid. + if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || + OpInfo.ConstraintVT.getSizeInBits() < 8) + report_fatal_error("Flag output operand is of invalid type"); + + // Get EFLAGS register. Only update chain when copyfrom is glued. + if (Flag.getNode()) { + Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag); + Chain = Flag.getValue(1); + } else + Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32); + // Extract CC code. + SDValue CC = getSETCC(Cond, Flag, DL, DAG); + // Extend to 32-bits + SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); + + return Result; +} + /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, @@ -42243,8 +44998,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { - // Widen to 64 bits here to get it sign extended. - Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); + bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; + BooleanContent BCont = getBooleanContents(MVT::i64); + ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) + : ISD::SIGN_EXTEND; + int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() + : CST->getSExtValue(); + Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); break; } @@ -42256,40 +45016,12 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. - GlobalAddressSDNode *GA = nullptr; - int64_t Offset = 0; - - // Match either (GA), (GA+C), (GA+C1+C2), etc. - while (1) { - if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { - Offset += GA->getOffset(); - break; - } else if (Op.getOpcode() == ISD::ADD) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - Offset += C->getZExtValue(); - Op = Op.getOperand(0); - continue; - } - } else if (Op.getOpcode() == ISD::SUB) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - Offset += -C->getZExtValue(); - Op = Op.getOperand(0); - continue; - } - } - - // Otherwise, this isn't something we can handle, reject it. - return; - } - - const GlobalValue *GV = GA->getGlobal(); - // If we require an extra load to get this address, as in PIC mode, we - // can't accept it. 
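// [Editorial note, not part of the patch] The new "{@cc<cond>}" constraints
// handled above implement GCC-style flag-output operands for inline assembly:
// the named condition is read from EFLAGS with SETcc and zero-extended to the
// operand type. A small standalone usage sketch (assumes an x86 target; the
// example code is not taken from the patch itself):
int zf_after_sub(int a, int b) {
  int zf;                              // receives 0 or 1
  __asm__("subl %2, %1"                // sets EFLAGS
          : "=@ccz"(zf), "+r"(a)       // "=@ccz": 1 iff ZF is set
          : "r"(b));
  return zf;
}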
- if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) - return; - - Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), - GA->getValueType(0), Offset); + if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) + // If we require an extra load to get this address, as in PIC mode, we + // can't accept it. + if (isGlobalStubReference( + Subtarget.classifyGlobalReference(GA->getGlobal()))) + return; break; } } @@ -42321,6 +45053,18 @@ static bool isFRClass(const TargetRegisterClass &RC) { RC.hasSuperClassEq(&X86::VR512RegClass); } +/// Check if \p RC is a mask register class. +/// I.e., VK* or one of their variant. +static bool isVKClass(const TargetRegisterClass &RC) { + return RC.hasSuperClassEq(&X86::VK1RegClass) || + RC.hasSuperClassEq(&X86::VK2RegClass) || + RC.hasSuperClassEq(&X86::VK4RegClass) || + RC.hasSuperClassEq(&X86::VK8RegClass) || + RC.hasSuperClassEq(&X86::VK16RegClass) || + RC.hasSuperClassEq(&X86::VK32RegClass) || + RC.hasSuperClassEq(&X86::VK64RegClass); +} + std::pair<unsigned, const TargetRegisterClass *> X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, @@ -42331,25 +45075,31 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // GCC Constraint Letters switch (Constraint[0]) { default: break; + // 'A' means [ER]AX + [ER]DX. + case 'A': + if (Subtarget.is64Bit()) + return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); + assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && + "Expecting 64, 32 or 16 bit subtarget"); + return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); + // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { - // Only supported in AVX512 or later. - switch (VT.SimpleTy) { - default: break; - case MVT::i32: - return std::make_pair(0U, &X86::VK32RegClass); - case MVT::i16: - return std::make_pair(0U, &X86::VK16RegClass); - case MVT::i8: - return std::make_pair(0U, &X86::VK8RegClass); - case MVT::i1: + if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1RegClass); - case MVT::i64: + if (VT == MVT::i8) + return std::make_pair(0U, &X86::VK8RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::VK16RegClass); + } + if (Subtarget.hasBWI()) { + if (VT == MVT::i32) + return std::make_pair(0U, &X86::VK32RegClass); + if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64RegClass); - } } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. @@ -42417,7 +45167,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Scalar SSE types. 
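// [Editorial note, not part of the patch] The "A" constraint case added above
// names the [ER]AX:[ER]DX pair. Classic usage sketch, assuming a 32-bit
// target where "A" binds a 64-bit value to EDX:EAX (illustration only):
unsigned long long read_tsc_i386(void) {
  unsigned long long tsc;
  __asm__ __volatile__("rdtsc" : "=A"(tsc));  // EDX:EAX -> tsc
  return tsc;
}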
case MVT::f32: case MVT::i32: - if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) + if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: @@ -42445,12 +45195,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); - return std::make_pair(0U, &X86::VR256RegClass); + if (Subtarget.hasAVX()) + return std::make_pair(0U, &X86::VR256RegClass); + break; case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: - return std::make_pair(0U, &X86::VR512RegClass); + if (!Subtarget.hasAVX512()) break; + if (VConstraint) + return std::make_pair(0U, &X86::VR512RegClass); + return std::make_pair(0U, &X86::VR512_0_15RegClass); } break; } @@ -42471,25 +45226,27 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. - if (Subtarget.hasAVX512()) { // Only supported in AVX512. - switch (VT.SimpleTy) { - default: break; - case MVT::i32: - return std::make_pair(0U, &X86::VK32WMRegClass); - case MVT::i16: - return std::make_pair(0U, &X86::VK16WMRegClass); - case MVT::i8: - return std::make_pair(0U, &X86::VK8WMRegClass); - case MVT::i1: + if (Subtarget.hasAVX512()) { + if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1WMRegClass); - case MVT::i64: + if (VT == MVT::i8) + return std::make_pair(0U, &X86::VK8WMRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::VK16WMRegClass); + } + if (Subtarget.hasBWI()) { + if (VT == MVT::i32) + return std::make_pair(0U, &X86::VK32WMRegClass); + if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64WMRegClass); - } } break; } } + if (parseConstraintCode(Constraint) != X86::COND_INVALID) + return std::make_pair(0U, &X86::GR32RegClass); + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass*> Res; @@ -42527,14 +45284,6 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (StringRef("{fpsr}").equals_lower(Constraint)) return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); - // 'A' means [ER]AX + [ER]DX. - if (Constraint == "A") { - if (Subtarget.is64Bit()) - return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); - assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && - "Expecting 64, 32 or 16 bit subtarget"); - return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); - } return Res; } @@ -42583,20 +45332,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. 
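// [Editorial note, not part of the patch] The 'k'/'Yk' cases above back the
// mask-register inline-asm constraints; the i16 flavour maps to VK16/VK16WM.
// A hedged usage sketch (assumes an AVX-512 target, e.g. -mavx512f; the "k"
// constraint lets the compiler pick mask registers for the operands):
unsigned short mask_and16(unsigned short a, unsigned short b) {
  unsigned short r;
  __asm__("kandw %2, %1, %0" : "=k"(r) : "k"(a), "k"(b));
  return r;
}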
- switch (Res.first) { - case X86::EAX: + switch (DestReg) { + case X86::RAX: return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); - case X86::EDX: + case X86::RDX: return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); - case X86::ECX: + case X86::RCX: return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); - case X86::EBX: + case X86::RBX: return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); - case X86::ESI: + case X86::RSI: return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); - case X86::EDI: + case X86::RDI: return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); - case X86::EBP: + case X86::RBP: return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); default: return std::make_pair(0, nullptr); @@ -42616,13 +45365,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) - Res.second = &X86::FR32RegClass; + Res.second = &X86::FR32XRegClass; else if (VT == MVT::f64 || VT == MVT::i64) - Res.second = &X86::FR64RegClass; - else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) - Res.second = &X86::VR128RegClass; - else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) - Res.second = &X86::VR256RegClass; + Res.second = &X86::FR64XRegClass; + else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT)) + Res.second = &X86::VR128XRegClass; + else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT)) + Res.second = &X86::VR256XRegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { @@ -42630,6 +45379,22 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.first = 0; Res.second = nullptr; } + } else if (isVKClass(*Class)) { + if (VT == MVT::i1) + Res.second = &X86::VK1RegClass; + else if (VT == MVT::i8) + Res.second = &X86::VK8RegClass; + else if (VT == MVT::i16) + Res.second = &X86::VK16RegClass; + else if (VT == MVT::i32) + Res.second = &X86::VK32RegClass; + else if (VT == MVT::i64) + Res.second = &X86::VK64RegClass; + else { + // Type mismatch and not a clobber: Return an error; + Res.first = 0; + Res.second = nullptr; + } } return Res; @@ -42682,7 +45447,7 @@ void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = - Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); AFI->setIsSplitCSR(true); } @@ -42710,9 +45475,9 @@ void X86TargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction().hasFnAttribute( - Attribute::NoUnwind) && - "Function should be nounwind in insertCopiesSplitCSR!"); + assert( + Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); @@ -42731,7 +45496,8 @@ bool X86TargetLowering::supportSwiftError() const { /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. 
-StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { +StringRef +X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 66d5d43946a2..e0be03bc3f9d 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -1,9 +1,8 @@ //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -78,15 +77,6 @@ namespace llvm { /// Same as call except it adds the NoTrack prefix. NT_CALL, - /// This operation implements the lowering for readcyclecounter. - RDTSC_DAG, - - /// X86 Read Time-Stamp Counter and Processor ID. - RDTSCP_DAG, - - /// X86 Read Performance Monitoring Counters. - RDPMC_DAG, - /// X86 compare and logical compare instructions. CMP, COMI, UCOMI, @@ -110,13 +100,12 @@ namespace llvm { FSETCC, /// X86 FP SETCC, similar to above, but with output as an i1 mask and - /// with optional rounding mode. - FSETCCM, FSETCCM_RND, + /// and a version with SAE. + FSETCCM, FSETCCM_SAE, /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. It also writes a - /// flag result. + /// flag operand produced by a CMP or TEST instruction. CMOV, /// X86 conditional branches. Operand 0 is the chain operand, operand 1 @@ -204,28 +193,29 @@ namespace llvm { /// Dynamic (non-constant condition) vector blend where only the sign bits /// of the condition elements are used. This is used to enforce that the /// condition mask is not valid for generic VSELECT optimizations. This - /// can also be used to implement the intrinsics. + /// is also used to implement the intrinsics. + /// Operands are in VSELECT order: MASK, TRUE, FALSE BLENDV, /// Combined add and sub on an FP vector. ADDSUB, // FP vector ops with rounding mode. - FADD_RND, FADDS_RND, - FSUB_RND, FSUBS_RND, - FMUL_RND, FMULS_RND, - FDIV_RND, FDIVS_RND, - FMAX_RND, FMAXS_RND, - FMIN_RND, FMINS_RND, - FSQRT_RND, FSQRTS_RND, + FADD_RND, FADDS, FADDS_RND, + FSUB_RND, FSUBS, FSUBS_RND, + FMUL_RND, FMULS, FMULS_RND, + FDIV_RND, FDIVS, FDIVS_RND, + FMAX_SAE, FMAXS_SAE, + FMIN_SAE, FMINS_SAE, + FSQRT_RND, FSQRTS, FSQRTS_RND, // FP vector get exponent. - FGETEXP_RND, FGETEXPS_RND, + FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE, // Extract Normalized Mantissas. - VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND, + VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE, // FP Scale. - SCALEF, - SCALEFS, + SCALEF, SCALEF_RND, + SCALEFS, SCALEFS_RND, // Unsigned Integer average. AVG, @@ -300,10 +290,10 @@ namespace llvm { VMTRUNC, VMTRUNCUS, VMTRUNCS, // Vector FP extend. - VFPEXT, VFPEXT_RND, VFPEXTS_RND, + VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE, // Vector FP round. 
- VFPROUND, VFPROUND_RND, VFPROUNDS_RND, + VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, // Masked version of above. Used for v2f64->v4f32. // SRC, PASSTHRU, MASK @@ -341,8 +331,8 @@ namespace llvm { /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, - // Vector comparison with rounding mode for FP values - CMPM_RND, + // Vector comparison with SAE for FP values + CMPM_SAE, // Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, UMUL, @@ -417,16 +407,16 @@ namespace llvm { // Bitwise ternary logic. VPTERNLOG, // Fix Up Special Packed Float32/64 values. - VFIXUPIMM, - VFIXUPIMMS, + VFIXUPIMM, VFIXUPIMM_SAE, + VFIXUPIMMS, VFIXUPIMMS_SAE, // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, VRANGE_RND, VRANGES, VRANGES_RND, + VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE, // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND, + VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE, // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. // Also used by the legacy (V)ROUND intrinsics where we mask out the // scaling part of the immediate. - VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND, + VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE, // Tests Types Of a FP Values for packed types. VFPCLASS, // Tests Types Of a FP Values for scalar types. @@ -497,6 +487,7 @@ namespace llvm { // Convert Unsigned/Integer to Floating-Point Value with rounding mode. SINT_TO_FP_RND, UINT_TO_FP_RND, + SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP, SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, // Vector float/double to signed/unsigned integer. @@ -505,9 +496,9 @@ namespace llvm { CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND, // Vector float/double to signed/unsigned integer with truncation. - CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND, + CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE, // Scalar float/double to signed/unsigned integer with truncation. - CVTTS2SI, CVTTS2UI, CVTTS2SI_RND, CVTTS2UI_RND, + CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE, // Vector signed/unsigned integer to float/double. CVTSI2P, CVTUI2P, @@ -515,6 +506,20 @@ namespace llvm { // Masked versions of above. Used for v2f64->v4f32. // SRC, PASSTHRU, MASK MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, + MCVTSI2P, MCVTUI2P, + + // Vector float to bfloat16. + // Convert TWO packed single data to one packed BF16 data + CVTNE2PS2BF16, + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16 pairs to accumulated into + // packed single precision. + DPBF16PS, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. @@ -545,6 +550,12 @@ namespace llvm { // indicate whether it is valid in CF. RDSEED, + // Protection keys + // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. + // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is + // value for ECX. + RDPKRU, WRPKRU, + // SSE42 string comparisons. // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG // will emit one or two instructions based on which results are used. If @@ -558,10 +569,11 @@ namespace llvm { XTEST, // ERI instructions. 
- RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2, + RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE, + RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE, // Conversions between float and half-float. - CVTPS2PH, CVTPH2PS, CVTPH2PS_RND, + CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE, // Masked version of above. // SRC, RND, PASSTHRU, MASK @@ -576,6 +588,12 @@ namespace llvm { // User level wait UMWAIT, TPAUSE, + // Enqueue Stores Instructions + ENQCMD, ENQCMDS, + + // For avx512-vp2intersect + VP2INTERSECT, + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, @@ -590,6 +608,9 @@ namespace llvm { // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, + // extract_vector_elt, store. + VEXTRACT_STORE, + // Store FP control world into i16 memory. FNSTCW16m, @@ -597,29 +618,33 @@ namespace llvm { /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It /// has two inputs (token chain and address) and two outputs (int value - /// and token chain). - FP_TO_INT16_IN_MEM, - FP_TO_INT32_IN_MEM, - FP_TO_INT64_IN_MEM, + /// and token chain). Memory VT specifies the type to store to. + FP_TO_INT_IN_MEM, /// This instruction implements SINT_TO_FP with the /// integer source in memory and FP reg result. This corresponds to the - /// X86::FILD*m instructions. It has three inputs (token chain, address, - /// and source type) and two outputs (FP value and token chain). FILD_FLAG - /// also produces a flag). + /// X86::FILD*m instructions. It has two inputs (token chain and address) + /// and two outputs (FP value and token chain). FILD_FLAG also produces a + /// flag). The integer source type is specified by the memory VT. FILD, FILD_FLAG, + /// This instruction implements a fp->int store from FP stack + /// slots. This corresponds to the fist instruction. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FIST, + /// This instruction implements an extending load to FP stack slots. /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain - /// operand, ptr to load from, and a ValueType node indicating the type - /// to load to. + /// operand, and ptr to load from. The memory VT specifies the type to + /// load from. FLD, - /// This instruction implements a truncating store to FP stack + /// This instruction implements a truncating store from FP stack /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a - /// chain operand, value to store, address, and a ValueType to store it - /// as. + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. FST, /// This instruction grabs the address of the next argument @@ -706,7 +731,7 @@ namespace llvm { /// target-independent logic. EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; /// Returns true if it's safe to use load / store of the /// specified type to expand memcpy / memset inline. This is mostly true @@ -719,7 +744,8 @@ namespace llvm { /// Returns true if the target allows unaligned memory accesses of the /// specified type. Returns whether it is "fast" in the last argument. 
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, - bool *Fast) const override; + MachineMemOperand::Flags Flags, + bool *Fast) const override; /// Provide custom lowering hooks for some operations. /// @@ -773,7 +799,11 @@ namespace llvm { /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - bool mergeStoresAfterLegalization() const override { return true; } + /// Do not merge vector stores after legalization because that may conflict + /// with x86-specific store splitting optimizations. + bool mergeStoresAfterLegalization(EVT MemVT) const override { + return !MemVT.isVector(); + } bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const override; @@ -810,7 +840,10 @@ namespace llvm { bool hasAndNot(SDValue Y) const override; - bool preferShiftsToClearExtremeBits(SDValue Y) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; + + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, @@ -830,6 +863,12 @@ namespace llvm { return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { + if (DAG.getMachineFunction().getFunction().hasMinSize()) + return false; + return true; + } + bool shouldSplatInsEltVarIndex(EVT VT) const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { @@ -839,11 +878,6 @@ namespace llvm { /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. MVT hasFastEqualityCompare(unsigned NumBits) const override; - /// Allow multiple load pairs per block for smaller and faster code. - unsigned getMemcmpEqZeroLoadsPerBlock() const override { - return 2; - } - /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -879,6 +913,8 @@ namespace llvm { TargetLoweringOpt &TLO, unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; + SDValue unwrapAddress(SDValue N) const override; SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; @@ -916,6 +952,11 @@ namespace llvm { return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } + /// Handle Lowering flag assembly outputs. + SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL, + const AsmOperandInfo &Constraint, + SelectionDAG &DAG) const override; + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On @@ -954,6 +995,12 @@ namespace llvm { bool isVectorShiftByScalarCheap(Type *Ty) const override; + /// Add x86-specific opcodes to the default list. + bool isBinOp(unsigned Opcode) const override; + + /// Returns true if the opcode is a commutative binary operation. + bool isCommutativeBinOp(unsigned Opcode) const override; + /// Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. @@ -999,7 +1046,8 @@ namespace llvm { /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. 
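// [Editorial note, not part of the patch] shouldFormOverflowOp above opts in
// to overflow nodes so the compare comes from the flags of the arithmetic op
// itself (add + seto/jo) rather than a separate cmp. A standalone
// illustration using the GCC/Clang builtin (example code, not from LLVM):
#include <cstdint>
bool add_would_overflow(int32_t a, int32_t b, int32_t &out) {
  return __builtin_add_overflow(a, b, &out);  // lowered via the overflow node
}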
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a @@ -1061,6 +1109,17 @@ namespace llvm { /// supported. bool shouldScalarizeBinop(SDValue) const override; + /// Extract of a scalar FP value from index 0 of a vector is free. + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; + } + + /// Overflow nodes should get combined/lowered to optimal instructions + /// (they should allow eliminating explicit compares by getting flags from + /// math ops). + bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override; + bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AddrSpace) const override { // If we can replace more than 2 scalar stores, there will be a reduction @@ -1068,7 +1127,9 @@ namespace llvm { return NumElem > 2; } - bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override; + bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const override; /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { @@ -1103,7 +1164,7 @@ namespace llvm { bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; - Value *getSSPStackGuardCheck(const Module &M) const override; + Function *getSSPStackGuardCheck(const Module &M) const override; SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override; @@ -1219,9 +1280,7 @@ namespace llvm { unsigned getAddressSpace(void) const; - std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool isSigned, - bool isReplace) const; + SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -1232,12 +1291,15 @@ namespace llvm { const unsigned char OpFlags = 0) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl, - int64_t Offset, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + /// Creates target global address or external symbol nodes for calls or + /// other uses. 
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, + bool ForCall) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; @@ -1566,10 +1628,10 @@ namespace llvm { void scaleShuffleMask(int Scale, ArrayRef<T> Mask, SmallVectorImpl<T> &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); - int NumElts = Mask.size(); - ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); + size_t NumElts = Mask.size(); + ScaledMask.assign(NumElts * Scale, -1); - for (int i = 0; i != NumElts; ++i) { + for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; // Repeat sentinel values in every mask element. diff --git a/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 7c00c9260d15..04e8b2231fec 100644 --- a/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/contrib/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -1,9 +1,8 @@ //===---- X86IndirectBranchTracking.cpp - Enables CET IBT mechanism -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -58,7 +57,7 @@ private: /// The function will not add it if already exists. /// It will add ENDBR32 or ENDBR64 opcode, depending on the target. /// \returns true if the ENDBR was added and false otherwise. - bool addENDBR(MachineBasicBlock &MBB) const; + bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; }; } // end anonymous namespace @@ -69,20 +68,31 @@ FunctionPass *llvm::createX86IndirectBranchTrackingPass() { return new X86IndirectBranchTrackingPass(); } -bool X86IndirectBranchTrackingPass::addENDBR(MachineBasicBlock &MBB) const { +bool X86IndirectBranchTrackingPass::addENDBR( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { assert(TII && "Target instruction info was not initialized"); assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) && "Unexpected Endbr opcode"); - auto MI = MBB.begin(); - // If the MBB is empty or the first instruction is not ENDBR, - // add the ENDBR instruction to the beginning of the MBB. - if (MI == MBB.end() || EndbrOpcode != MI->getOpcode()) { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(EndbrOpcode)); - NumEndBranchAdded++; + // If the MBB/I is empty or the current instruction is not ENDBR, + // insert ENDBR instruction to the location of I. 
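// [Editorial note, not part of the patch] scaleShuffleMask above widens a
// shuffle mask by an integral factor: sentinel (negative) entries are
// repeated, real entries are scaled and fanned out. A minimal standalone
// sketch of that transform under those assumptions (hypothetical helper,
// illustration only):
#include <cstddef>
#include <vector>
static void scaleMaskSketch(int Scale, const std::vector<int> &Mask,
                            std::vector<int> &Scaled) {
  Scaled.assign(Mask.size() * Scale, -1);     // pre-fill with sentinels
  for (std::size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= 0)
      for (int s = 0; s != Scale; ++s)
        Scaled[i * Scale + s] = Mask[i] * Scale + s;
}
// e.g. Mask = {1, -1}, Scale = 2  ->  Scaled = {2, 3, -1, -1}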
+ if (I == MBB.end() || I->getOpcode() != EndbrOpcode) { + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(EndbrOpcode)); + ++NumEndBranchAdded; return true; } + return false; +} +bool IsCallReturnTwice(llvm::MachineOperand &MOp) { + if (!MOp.isGlobal()) + return false; + auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal()); + if (!CalleeFn) + return false; + AttributeList Attrs = CalleeFn->getAttributes(); + if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice)) + return true; return false; } @@ -108,14 +118,21 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { !MF.getFunction().hasLocalLinkage()) && !MF.getFunction().doesNoCfCheck()) { auto MBB = MF.begin(); - Changed |= addENDBR(*MBB); + Changed |= addENDBR(*MBB, MBB->begin()); } - for (auto &MBB : MF) + for (auto &MBB : MF) { // Find all basic blocks that their address was taken (for example // in the case of indirect jump) and add ENDBR instruction. if (MBB.hasAddressTaken()) - Changed |= addENDBR(MBB); - + Changed |= addENDBR(MBB, MBB.begin()); + + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (!I->isCall()) + continue; + if (IsCallReturnTwice(I->getOperand(0))) + Changed |= addENDBR(MBB, std::next(I)); + } + } return Changed; } diff --git a/contrib/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 8bd57aa2278b..02ae73706a34 100644 --- a/contrib/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/contrib/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -1,9 +1,8 @@ //===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td index 49e9e924887a..cd1b06365971 100644 --- a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td +++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td @@ -1,9 +1,8 @@ //===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
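// [Editorial note, not part of the patch] The returns_twice handling added
// above places an ENDBR right after calls to functions such as setjmp: the
// second "return" arrives via an indirect transfer, so under CET/IBT the
// instruction following the call site must be a valid landing pad. A tiny
// standalone illustration of the kind of call site affected (rationale
// inferred from the code above, example only):
#include <setjmp.h>
static jmp_buf env;
int needs_endbr_after_call(void) {
  if (setjmp(env))   // longjmp resumes here; the pass adds ENDBR after the call
    return 1;
  return 0;
}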
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -74,7 +73,9 @@ defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>; defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>; defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>; -let SchedRW = [WriteEMMS] in +let SchedRW = [WriteEMMS], + Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>, TB; diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index 85676f102be0..54eddeacaa17 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1,9 +1,8 @@ //===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,6 +26,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, // Corresponding mask register class. RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts); + // Corresponding mask register pair class. + RegisterOperand KRPC = !if (!gt(NumElts, 16), ?, + !cast<RegisterOperand>("VK" # NumElts # "Pair")); + // Corresponding write-mask register class. RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); @@ -95,10 +98,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); - // A vector type of the same width with element type i32. This is used to - // create the canonical constant zero node ImmAllZerosV. 
- ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); - dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + dag ImmAllZerosV = (VT immAllZerosV); string ZSuffix = !if (!eq (Size, 128), "Z128", !if (!eq (Size, 256), "Z256", "Z")); @@ -277,10 +277,9 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, - bit IsCommutable = 0> : + dag RHS> : AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm, - RHS, IsCommutable, 0, IsCommutable, X86selects>; + RHS, 0, 0, 0, X86selects>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -365,7 +364,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, list<dag> Pattern, list<dag> MaskingPattern, bit IsCommutable = 0> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable in { def NAME: AVX512<O, F, Outs, Ins, OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# "$dst, "#IntelSrcAsm#"}", @@ -375,6 +374,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# "$dst {${mask}}, "#IntelSrcAsm#"}", MaskingPattern>, EVEX_K; + } } multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, @@ -392,38 +392,11 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, bit IsCommutable = 0> : + dag RHS, dag RHS_su, bit IsCommutable = 0> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS), IsCommutable>; - -multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, - dag Outs, dag Ins, string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm> : - AVX512_maskable_custom_cmp<O, F, Outs, - Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, - AttSrcAsm, IntelSrcAsm, [], []>; - -// This multiclass generates the unconditional/non-masking, the masking and -// the zero-masking variant of the vector instruction. In the masking case, the -// perserved vector elements come from a new dummy input operand tied to $dst. -multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _, - dag Outs, dag Ins, string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskedRHS, - bit IsCommutable = 0, SDNode Select = vselect> : - AVX512_maskable_custom<O, F, Outs, Ins, - !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), - !con((ins _.KRCWM:$mask), Ins), - OpcodeStr, AttSrcAsm, IntelSrcAsm, - [(set _.RC:$dst, RHS)], - [(set _.RC:$dst, - (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))], - [(set _.RC:$dst, - (Select _.KRCWM:$mask, MaskedRHS, - _.ImmAllZerosV))], - "$src0 = $dst", IsCommutable>; + (and _.KRCWM:$mask, RHS_su), IsCommutable>; // Alias instruction that maps zero vector to pxor / xorp* for AVX-512. 
@@ -451,8 +424,8 @@ def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), (ins VK8WM:$mask), "", [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask), - (bc_v8i64 (v16i32 immAllOnesV)), - (bc_v8i64 (v16i32 immAllZerosV))))]>; + (v8i64 immAllOnesV), + (v8i64 immAllZerosV)))]>; } let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -753,6 +726,7 @@ defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info, // vinsertps - insert f32 to XMM let ExeDomain = SSEPackedSingle in { +let isCommutable = 1 in def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -1378,15 +1352,15 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX512] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))), + def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; } let Predicates = [HasVLX] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ128m addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { @@ -1397,12 +1371,30 @@ let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ128m addr:$src)>; def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; } +let Predicates = [HasBWI] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; + def : Pat<(v32i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; + def : Pat<(v32i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZm addr:$src)>; +} //===----------------------------------------------------------------------===// // AVX-512 BROADCAST SUBVECTORS @@ -1464,7 +1456,7 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))), // Patterns for selects of bitcasted operations. 
def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - (bc_v16f32 (v16i32 immAllZerosV))), + (v16f32 immAllZerosV)), (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), @@ -1481,7 +1473,7 @@ def : Pat<(vselect VK16WM:$mask, def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), - (bc_v8f64 (v16i32 immAllZerosV))), + (v8f64 immAllZerosV)), (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), @@ -1489,7 +1481,7 @@ def : Pat<(vselect VK8WM:$mask, (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), - (bc_v8i64 (v16i32 immAllZerosV))), + (v8i64 immAllZerosV)), (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), @@ -1517,7 +1509,7 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), // Patterns for selects of bitcasted operations. def : Pat<(vselect VK8WM:$mask, (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - (bc_v8f32 (v8i32 immAllZerosV))), + (v8f32 immAllZerosV)), (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), @@ -1566,7 +1558,7 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2" // Patterns for selects of bitcasted operations. def : Pat<(vselect VK4WM:$mask, (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - (bc_v4f64 (v8i32 immAllZerosV))), + (v4f64 immAllZerosV)), (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), @@ -1574,7 +1566,7 @@ def : Pat<(vselect VK4WM:$mask, (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - (bc_v4i64 (v8i32 immAllZerosV))), + (v4i64 immAllZerosV)), (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), @@ -1599,7 +1591,7 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", // Patterns for selects of bitcasted operations. 
def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), - (bc_v16f32 (v16i32 immAllZerosV))), + (v16f32 immAllZerosV)), (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), @@ -1616,7 +1608,7 @@ def : Pat<(vselect VK16WM:$mask, def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - (bc_v8f64 (v16i32 immAllZerosV))), + (v8f64 immAllZerosV)), (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), @@ -1624,7 +1616,7 @@ def : Pat<(vselect VK8WM:$mask, (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - (bc_v8i64 (v16i32 immAllZerosV))), + (v8i64 immAllZerosV)), (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), @@ -2031,96 +2023,86 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend, // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd, +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, + PatFrag OpNode_su, PatFrag OpNodeSAE_su, X86FoldableSchedWrite sched> { defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc)>, EVEX_4V, Sched<[sched]>; + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", + (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + imm:$cc), + (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (OpNodeRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc, - (i32 FROUND_NO_EXC))>, - EVEX_4V, EVEX_B, Sched<[sched]>; - // Accept explicit immediate argument form instead of comparison code. 
- let isAsmParserOnly = 1, hasSideEffects = 0 in { - defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs VK1:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V, - Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; - - defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, - EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable; - }// let isAsmParserOnly = 1, hasSideEffects = 0 + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", + (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc), + (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)>, + EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { let isCommutable = 1 in def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, u8imm:$cc), + !strconcat("vcmp", _.Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, imm:$cc))]>, - EVEX_4V, Sched<[sched]>; + EVEX_4V, VEX_LIG, Sched<[sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + !strconcat("vcmp", _.Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), imm:$cc))]>, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } +def X86cmpms_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpms node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; +def X86cmpmsSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpmsSAE node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; + let Predicates = [HasAVX512] in { let ExeDomain = SSEPackedSingle in - defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd, + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsSAE, + X86cmpms_su, X86cmpmsSAE_su, SchedWriteFCmp.Scl>, AVX512XSIi8Base; let ExeDomain = SSEPackedDouble in - defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd, + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsSAE, + X86cmpms_su, X86cmpmsSAE_su, SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo _, - bit IsCommutable> { + PatFrag OpNode_su, X86FoldableSchedWrite sched, + X86VectorVTInfo _, bit IsCommutable> { let isCommutable = IsCommutable in def rr : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), @@ -2139,22 +2121,23 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, 
PatFrag OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, + (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, EVEX_4V, EVEX_K, Sched<[sched]>; def rmk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (OpNode_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)))))]>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode, + PatFrag OpNode_su, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed<opc, OpcodeStr, OpNode, sched, _, IsCommutable> { + avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> { def rmb : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", @@ -2169,7 +2152,7 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (OpNode_su (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, EVEX_4V, EVEX_K, EVEX_B, @@ -2177,33 +2160,34 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode, } multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode, - X86SchedWriteWidths sched, + PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.ZMM, + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.YMM, + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.XMM, + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, VTInfo.info128, IsCommutable>, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, - PatFrag OpNode, X86SchedWriteWidths sched, + PatFrag OpNode, PatFrag OpNode_su, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.ZMM, + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.YMM, + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.XMM, + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, VTInfo.info128, IsCommutable>, EVEX_V128; } } @@ -2216,59 +2200,69 @@ def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc 
node:$src1, node:$src2, SETGT)>; +def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), + (X86pcmpeqm_c node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; +def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), + (X86pcmpgtm node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? -defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86FoldableSchedWrite sched, + PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su, + X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { let isCommutable = 1 in def rri : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), - !strconcat("vpcmp${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), (_.VT _.RC:$src2), cond)))]>, EVEX_4V, Sched<[sched]>; def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), - !strconcat("vpcmp${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), @@ -2278,67 +2272,36 
@@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, let isCommutable = 1 in def rrik : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, - AVX512ICC:$cc), - !strconcat("vpcmp${cc}", Suffix, - "\t{$src2, $src1, $dst {${mask}}|", - "$dst {${mask}}, $src1, $src2}"), + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - cond))))]>, + (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + cond))))]>, EVEX_4V, EVEX_K, Sched<[sched]>; def rmik : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, - AVX512ICC:$cc), - !strconcat("vpcmp${cc}", Suffix, - "\t{$src2, $src1, $dst {${mask}}|", - "$dst {${mask}}, $src1, $src2}"), + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, (_.KVT - (Frag:$cc + (Frag_su:$cc (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), cond))))]>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; - // Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", - "$dst, $src1, $src2, $cc}"), []>, - EVEX_4V, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def rmi_alt : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), - !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", - "$dst, $src1, $src2, $cc}"), []>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; - def rrik_alt : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, - u8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst {${mask}}|", - "$dst {${mask}}, $src1, $src2, $cc}"), []>, - EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def rmik_alt : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, - u8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst {${mask}}|", - "$dst {${mask}}, $src1, $src2, $cc}"), []>, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } - def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2), + (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, @@ -2346,15 +2309,17 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, } multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86FoldableSchedWrite sched, + PatFrag Frag_su, PatFrag CommFrag, + PatFrag CommFrag_su, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> : - avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> { + avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su, + sched, _, Name> { def rmib : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, - 
AVX512ICC:$cc), - !strconcat("vpcmp${cc}", Suffix, - "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", - "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), (X86VBroadcast @@ -2363,45 +2328,25 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, - _.ScalarMemOp:$src2, AVX512ICC:$cc), - !strconcat("vpcmp${cc}", Suffix, - "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", - "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + _.ScalarMemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, - (_.KVT (Frag:$cc + (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)), cond))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - // Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { - def rmib_alt : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, - u8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", - "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, - _.ScalarMemOp:$src2, u8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", - "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>, - EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), (_.VT _.RC:$src1), cond)), (!cast<Instruction>(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag:$cc (X86VBroadcast + (_.KVT (CommFrag_su:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), (_.VT _.RC:$src1), cond))), (!cast<Instruction>(Name#_.ZSuffix#"rmibk") @@ -2410,32 +2355,34 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, } multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86SchedWriteWidths sched, + PatFrag Frag_su, PatFrag CommFrag, + PatFrag CommFrag_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM, - VTInfo.info512, NAME>, EVEX_V512; + defm Z : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su, + sched.ZMM, VTInfo.info512, NAME>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM, - VTInfo.info256, NAME>, EVEX_V256; - defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM, - VTInfo.info128, NAME>, EVEX_V128; + defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su, + sched.YMM, VTInfo.info256, NAME>, EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, 
CommFrag, CommFrag_su, + sched.XMM, VTInfo.info128, NAME>, EVEX_V128; } } multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag, - PatFrag CommFrag, X86SchedWriteWidths sched, + PatFrag Frag_su, PatFrag CommFrag, + PatFrag CommFrag_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM, - VTInfo.info512, NAME>, EVEX_V512; + defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su, + sched.ZMM, VTInfo.info512, NAME>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM, - VTInfo.info256, NAME>, EVEX_V256; - defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM, - VTInfo.info128, NAME>, EVEX_V128; + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su, + sched.YMM, VTInfo.info256, NAME>, EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, Frag_su, CommFrag, CommFrag_su, + sched.XMM, VTInfo.info128, NAME>, EVEX_V128; } } @@ -2459,6 +2406,12 @@ def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc), return !ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm>; +def X86pcmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm>; + // Same as above, but commutes immediate. Use for load folding. def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), (setcc node:$src1, node:$src2, node:$cc), [{ @@ -2466,12 +2419,24 @@ def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), return !ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm_commute>; +def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm_commute>; + def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc), (setcc node:$src1, node:$src2, node:$cc), [{ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); return ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm>; +def X86pcmpum_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm>; + // Same as above, but commutes immediate. Use for load folding. def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), (setcc node:$src1, node:$src2, node:$cc), [{ @@ -2479,93 +2444,91 @@ def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), return ISD::isUnsignedIntSetCC(CC); }], X86pcmpm_imm_commute>; +def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm_commute>; + // FIXME: Is there a better scheduler class for VPCMP/VPCMPU? 
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute, +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>; -defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute, +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>; -defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute, +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute, +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute, +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute, +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute, +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su, + X86pcmpm_commute, X86pcmpm_commute_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute, +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su, + X86pcmpum_commute, X86pcmpum_commute_su, SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpm node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; +def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{ + return N->hasOneUse(); +}]>; + multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", - (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc), 1>, - Sched<[sched]>; + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + 1>, Sched<[sched]>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "$src2, $src1", "$src1, $src2", - (X86cmpm (_.VT _.RC:$src1), - (_.VT (_.LdFrag addr:$src2)), - imm:$cc)>, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc", + (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + imm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + imm:$cc)>, Sched<[sched.Folded, 
sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $cc", (X86cmpm (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc)>, + imm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - // Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0 in { - defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - Sched<[sched]>, NotMemoryFoldable; - - let mayLoad = 1 in { - defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - - defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $cc">, - EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } - } // Patterns for selecting with loads in other operand. def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), @@ -2573,9 +2536,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, imm:$cc)>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2), - (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc)>; @@ -2585,10 +2548,10 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, imm:$cc)>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc)>; @@ -2597,24 +2560,14 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { // comparison code form (VCMP[EQ/LT/LE/...] 
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), - "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc, - (i32 FROUND_NO_EXC))>, + (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc", + (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)>, EVEX_B, Sched<[sched]>; - - let isAsmParserOnly = 1, hasSideEffects = 0 in { - defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, - (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - "vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1", - "$src1, $src2, {sae}, $cc">, - EVEX_B, Sched<[sched]>, NotMemoryFoldable; - } } multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { @@ -2647,16 +2600,27 @@ let Predicates = [HasAVX512] in { // ---------------------------------------------------------------- // FPClass + +def X86Vfpclasss_su : PatFrag<(ops node:$src1, node:$src2), + (X86Vfpclasss node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + +def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2), + (X86Vfpclass node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + //handle fpclass instruction mask = op(reg_scalar,imm) // op(mem_scalar,imm) -multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, Predicate prd> { let Predicates = [prd], ExeDomain = _.ExeDomain in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1), (i32 imm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), @@ -2664,7 +2628,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr##_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (X86Vfpclasss_su (_.VT _.RC:$src1), (i32 imm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), @@ -2672,15 +2636,15 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr##_.Suffix## "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, - (OpNode _.ScalarIntMemCPat:$src1, - (i32 imm:$src2)))]>, + (X86Vfpclasss _.ScalarIntMemCPat:$src1, + (i32 imm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, - (OpNode _.ScalarIntMemCPat:$src1, + (X86Vfpclasss_su _.ScalarIntMemCPat:$src1, (i32 imm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2689,14 +2653,14 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, //handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) // fpclass(reg_vec, mem_vec, imm) // fpclass(reg_vec, broadcast(eltVt), imm) -multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_vector_fpclass<bits<8> 
opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, - string mem, string broadcast>{ + string mem>{ let ExeDomain = _.ExeDomain in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1), (i32 imm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), @@ -2704,85 +2668,103 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr##_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), + (X86Vfpclass_su (_.VT _.RC:$src1), (i32 imm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix##mem# + OpcodeStr##_.Suffix#"{"#mem#"}"# "\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set _.KRC:$dst,(OpNode + [(set _.KRC:$dst,(X86Vfpclass (_.VT (_.LdFrag addr:$src1)), (i32 imm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix##mem# + OpcodeStr##_.Suffix#"{"#mem#"}"# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", - [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode + [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (_.LdFrag addr:$src1)), (i32 imm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + OpcodeStr##_.Suffix##"\t{$src2, ${src1}"## _.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", - [(set _.KRC:$dst,(OpNode + [(set _.KRC:$dst,(X86Vfpclass (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src1))), (i32 imm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + OpcodeStr##_.Suffix##"\t{$src2, ${src1}"## _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", - [(set _.KRC:$dst,(and _.KRCWM:$mask, (OpNode + [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src1))), (i32 imm:$src2))))]>, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } + + // Allow registers or broadcast with the x, y, z suffix we use to disambiguate + // the memory form. 
+ def : InstAlias<OpcodeStr#_.Suffix#mem# + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>(NAME#"rr") + _.KRC:$dst, _.RC:$src1, i32u8imm:$src2), 0, "att">; + def : InstAlias<OpcodeStr#_.Suffix#mem# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + (!cast<Instruction>(NAME#"rrk") + _.KRC:$dst, _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), 0, "att">; + def : InstAlias<OpcodeStr#_.Suffix#mem# + "\t{$src2, ${src1}"#_.BroadcastStr#", $dst|$dst, ${src1}"# + _.BroadcastStr#", $src2}", + (!cast<Instruction>(NAME#"rmb") + _.KRC:$dst, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">; + def : InstAlias<OpcodeStr#_.Suffix#mem# + "\t{$src2, ${src1}"#_.BroadcastStr#", $dst {${mask}}|" + "$dst {${mask}}, ${src1}"#_.BroadcastStr#", $src2}", + (!cast<Instruction>(NAME#"rmbk") + _.KRC:$dst, _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">; } multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _, - bits<8> opc, SDNode OpNode, - X86SchedWriteWidths sched, Predicate prd, - string broadcast>{ + bits<8> opc, X86SchedWriteWidths sched, + Predicate prd>{ let Predicates = [prd] in { - defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.ZMM, - _.info512, "{z}", broadcast>, EVEX_V512; + defm Z : avx512_vector_fpclass<opc, OpcodeStr, sched.ZMM, + _.info512, "z">, EVEX_V512; } let Predicates = [prd, HasVLX] in { - defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.XMM, - _.info128, "{x}", broadcast>, EVEX_V128; - defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.YMM, - _.info256, "{y}", broadcast>, EVEX_V256; + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, sched.XMM, + _.info128, "x">, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, sched.YMM, + _.info256, "y">, EVEX_V256; } } multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, - bits<8> opcScalar, SDNode VecOpNode, - SDNode ScalarOpNode, X86SchedWriteWidths sched, + bits<8> opcScalar, X86SchedWriteWidths sched, Predicate prd> { defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, - VecOpNode, sched, prd, "{l}">, + sched, prd>, EVEX_CD8<32, CD8VF>; defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, - VecOpNode, sched, prd, "{q}">, + sched, prd>, EVEX_CD8<64, CD8VF> , VEX_W; - defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, - sched.Scl, f32x_info, prd>, + defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, + sched.Scl, f32x_info, prd>, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, - sched.Scl, f64x_info, prd>, + defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, + sched.Scl, f64x_info, prd>, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W; } -defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, - X86Vfpclasss, SchedWriteFCmp, HasDQI>, - AVX512AIi8Base, EVEX; +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp, + HasDQI>, AVX512AIi8Base, EVEX; //----------------------------------------------------------------- // Mask register copy, including @@ -3039,26 +3021,24 @@ defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>; defm : avx512_binop_pat<xor, xor, KXORWrr>; // Mask unpacking -multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, - RegisterClass KRCSrc, X86FoldableSchedWrite sched, +multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst, + X86KVectorVTInfo Src, X86FoldableSchedWrite sched, Predicate prd> { let Predicates = [prd] in { let 
hasSideEffects = 0 in - def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), - (ins KRC:$src1, KRC:$src2), + def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst), + (ins Src.KRC:$src1, Src.KRC:$src2), "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX_4V, VEX_L, Sched<[sched]>; - def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), - (!cast<Instruction>(NAME##rr) - (COPY_TO_REGCLASS KRCSrc:$src2, KRC), - (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)), + (!cast<Instruction>(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>; } } -defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD; -defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS; -defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W; +defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -3118,7 +3098,8 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr, +multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, + string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), @@ -3130,8 +3111,8 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr, Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), + (Frag_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2)))), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrk") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), @@ -3141,7 +3122,7 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr, } // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, +multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { @@ -3154,9 +3135,9 @@ def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), (Frag.OperandTransform $cc)), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), - cond)))), + (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), + cond)))), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3165,7 +3146,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, } // Same as above, but for fp types which don't use PatFrags. 
-multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr, +multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su, + string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), @@ -3177,8 +3159,8 @@ def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), imm:$cc), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (OpNode (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc))), + (OpNode_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), imm:$cc))), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3190,65 +3172,65 @@ let Predicates = [HasAVX512, NoVLX] in { // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>; } - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, 
"VPCMPQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>; } let Predicates = [HasBWI, NoVLX] in { // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>; } - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, 
"VPCMPUB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>; - defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUW", v8i16x_info, v32i16_info>; } // Mask setting all 0s or 1s @@ -3394,15 +3376,15 @@ multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, string EVEX2VEXOvrd, bit NoRMPattern = 0> { let Predicates = [prd] in defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, - _.info512.AlignedLdFrag, masked_load_aligned512, + _.info512.AlignedLdFrag, masked_load_aligned, Sched.ZMM, "", NoRMPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, - _.info256.AlignedLdFrag, masked_load_aligned256, + _.info256.AlignedLdFrag, masked_load_aligned, Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256; defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, - _.info128.AlignedLdFrag, masked_load_aligned128, + _.info128.AlignedLdFrag, masked_load_aligned, Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128; } } @@ -3414,15 +3396,15 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag, - masked_load_unaligned, Sched.ZMM, "", + masked_load, Sched.ZMM, "", NoRMPattern, SelectOprr>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag, - masked_load_unaligned, Sched.YMM, EVEX2VEXOvrd#"Y", + masked_load, Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern, SelectOprr>, EVEX_V256; defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag, - masked_load_unaligned, Sched.XMM, EVEX2VEXOvrd, + masked_load, Sched.XMM, EVEX2VEXOvrd, NoRMPattern, SelectOprr>, EVEX_V128; } } @@ -3488,14 +3470,14 @@ multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, string EVEX2VEXOvrd, bit NoMRPattern = 0> { let Predicates = [prd] in defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store, - masked_store_unaligned, Sched.ZMM, "", + masked_store, Sched.ZMM, "", NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store, - masked_store_unaligned, Sched.YMM, + 
masked_store, Sched.YMM, EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store, - masked_store_unaligned, Sched.XMM, EVEX2VEXOvrd, + masked_store, Sched.XMM, EVEX2VEXOvrd, NoMRPattern>, EVEX_V128; } } @@ -3506,15 +3488,15 @@ multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, string EVEX2VEXOvrd, bit NoMRPattern = 0> { let Predicates = [prd] in defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore, - masked_store_aligned512, Sched.ZMM, "", + masked_store_aligned, Sched.ZMM, "", NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore, - masked_store_aligned256, Sched.YMM, + masked_store_aligned, Sched.YMM, EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore, - masked_store_aligned128, Sched.XMM, EVEX2VEXOvrd, + masked_store_aligned, Sched.XMM, EVEX2VEXOvrd, NoMRPattern>, EVEX_V128; } } @@ -3609,7 +3591,7 @@ def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), "", []>, Sched<[WriteFStoreY]>; } -def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), +def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 immAllZerosV), (v8i64 VR512:$src))), (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), VK8), VR512:$src)>; @@ -3621,7 +3603,7 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), // These patterns exist to prevent the above patterns from introducing a second // mask inversion when one already exists. def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)), - (bc_v8i64 (v16i32 immAllZerosV)), + (v8i64 immAllZerosV), (v8i64 VR512:$src))), (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>; def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)), @@ -3761,75 +3743,6 @@ let Predicates = [HasVLX] in { (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } -multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From, - X86VectorVTInfo To, X86VectorVTInfo Cast> { - def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, - (bitconvert - (To.VT (extract_subvector - (From.VT From.RC:$src), (iPTR 0)))), - To.RC:$src0)), - (Cast.VT (!cast<Instruction>(InstrStr#"rrk") - Cast.RC:$src0, Cast.KRCWM:$mask, - (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>; - - def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, - (bitconvert - (To.VT (extract_subvector - (From.VT From.RC:$src), (iPTR 0)))), - Cast.ImmAllZerosV)), - (Cast.VT (!cast<Instruction>(InstrStr#"rrkz") - Cast.KRCWM:$mask, - (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>; -} - - -let Predicates = [HasVLX] in { -// A masked extract from the first 128-bits of a 256-bit vector can be -// implemented with masked move. 
-defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>; -defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>; -defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>; -defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>; - -// A masked extract from the first 128-bits of a 512-bit vector can be -// implemented with masked move. -defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>; -defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>; -defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>; -defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>; -defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>; - -// A masked extract from the first 256-bits of a 512-bit vector can be -// implemented with masked move. 
-defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>; -defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>; -defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>; -defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>; -defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>; -} - // Move Int Doubleword to Packed Double Int // let ExeDomain = SSEPackedInt in { @@ -3858,19 +3771,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src) "vmovq\t{$src, $dst|$dst, $src}", [(set FR64X:$dst, (bitconvert GR64:$src))]>, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; -def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>, - EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>; def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64X:$src))]>, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; -def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>, - EVEX, VEX_W, Sched<[WriteVecStore]>, - EVEX_CD8<64, CD8VT1>; } } // ExeDomain = SSEPackedInt @@ -3881,11 +3785,6 @@ def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src) "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))]>, EVEX, Sched<[WriteVecMoveFromGpr]>; - -def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), - "vmovd\t{$src, $dst|$dst, $src}", - [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>, - EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move doubleword from xmm register to r/m32 @@ -3938,6 +3837,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>; +let Predicates = [HasAVX512] in { + def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst), + (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>; +} + // Move Scalar Single to Double Int // let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { @@ -3946,11 +3850,6 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))]>, EVEX, Sched<[WriteVecMoveToGpr]>; -def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), - (ins i32mem:$dst, FR32X:$src), - "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>, - EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move 
Quadword Int to Packed Quadword Int @@ -3974,7 +3873,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar<string asm, SDNode OpNode, +multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag, X86VectorVTInfo _> { let Predicates = [HasAVX512, OptForSize] in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), @@ -3999,11 +3898,18 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, (_.VT (OpNode _.RC:$src1, _.RC:$src2)), (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>; - let canFoldAsLoad = 1, isReMaterializable = 1 in - def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + let canFoldAsLoad = 1, isReMaterializable = 1 in { + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))], _.ExeDomain>, EVEX, Sched<[WriteFLoad]>; + // _alt version uses FR32/FR64 register class. + let isCodeGenOnly = 1 in + def rm_alt : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain>, EVEX, Sched<[WriteFLoad]>; + } let mayLoad = 1, hasSideEffects = 0 in { let Constraints = "$src0 = $dst" in def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), @@ -4023,16 +3929,16 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, EVEX, Sched<[WriteFStore]>; let mayStore = 1, hasSideEffects = 0 in def mrk: AVX512PI<0x11, MRMDestMem, (outs), - (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.RC:$src), !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>, NotMemoryFoldable; } -defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>, VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -4070,7 +3976,7 @@ def : Pat<(masked_store (iPTR 0))), addr:$dst, Mask), (!cast<Instruction>(InstrStr#mrk) addr:$dst, (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), - (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + _.info128.RC:$src)>; } @@ -4085,7 +3991,7 @@ def : Pat<(masked_store (iPTR 0))), addr:$dst, Mask), (!cast<Instruction>(InstrStr#mrk) addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), - (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + _.info128.RC:$src)>; } @@ -4105,13 +4011,13 @@ def : Pat<(masked_store (iPTR 0))), addr:$dst, Mask512), (!cast<Instruction>(InstrStr#mrk) addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), - (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + _.info128.RC:$src)>; // AVX512VL pattern. 
def : Pat<(masked_store (_.info128.VT _.info128.RC:$src), addr:$dst, Mask128), (!cast<Instruction>(InstrStr#mrk) addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), - (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + _.info128.RC:$src)>; } multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, @@ -4119,8 +4025,7 @@ multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, def : Pat<(_.info128.VT (extract_subvector (_.info512.VT (masked_load addr:$srcAddr, Mask, - (_.info512.VT (bitconvert - (v16i32 immAllZerosV))))), + _.info512.ImmAllZerosV)), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), @@ -4145,8 +4050,7 @@ multiclass avx512_load_scalar_lowering_subreg<string InstrStr, def : Pat<(_.info128.VT (extract_subvector (_.info512.VT (masked_load addr:$srcAddr, Mask, - (_.info512.VT (bitconvert - (v16i32 immAllZerosV))))), + _.info512.ImmAllZerosV)), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), @@ -4175,8 +4079,7 @@ multiclass avx512_load_scalar_lowering_subreg2<string InstrStr, // AVX512F patterns. def : Pat<(_.info128.VT (extract_subvector (_.info512.VT (masked_load addr:$srcAddr, Mask512, - (_.info512.VT (bitconvert - (v16i32 immAllZerosV))))), + _.info512.ImmAllZerosV)), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), @@ -4194,7 +4097,7 @@ def : Pat<(_.info128.VT (extract_subvector // AVX512Vl patterns. def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128, - (_.info128.VT (bitconvert (v4i32 immAllZerosV))))), + _.info128.ImmAllZerosV)), (!cast<Instruction>(InstrStr#rmkz) (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; @@ -4383,15 +4286,6 @@ let Predicates = [HasAVX512, OptForSize] in { (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), - (SUBREG_TO_REG (i32 0), - (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>; - def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), @@ -4400,17 +4294,6 @@ let Predicates = [HasAVX512, OptForSize] in { (SUBREG_TO_REG (i32 0), (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>; - - def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), - (SUBREG_TO_REG (i32 0), - (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>; - - def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>; - } // Use 128-bit blends for OptForSpeed since BLENDs have better throughput than @@ -4426,79 +4309,27 @@ let Predicates = [HasAVX512, OptForSpeed] in { (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), (i8 3))), sub_xmm)>; - - def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), - (SUBREG_TO_REG 
(i32 0), - (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), - (i8 1))), sub_xmm)>; - def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), - (i8 0xf))), sub_xmm)>; } let Predicates = [HasAVX512] in { - - // MOVSSrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; - def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; - def : Pat<(v4f32 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; - - // MOVSDrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; - def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; - def : Pat<(v2f64 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (VMOVSSZrm addr:$src)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (VMOVSDZrm addr:$src)>; // Represent the same patterns above but in the form they appear for // 256-bit types - def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; - def : Pat<(v8f32 (X86vzload addr:$src)), + def : Pat<(v8f32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; - def : Pat<(v4f64 (X86vzload addr:$src)), + def : Pat<(v4f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; // Represent the same patterns above but in the form they appear for // 512-bit types - def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; - def : Pat<(v16f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + def : Pat<(v16f32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; - def : Pat<(v16f32 (X86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; - def : Pat<(v8f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; - def : Pat<(v8f64 (X86vzload addr:$src)), + def : Pat<(v8f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; - - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; - - // Extract and 
store. - def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), - addr:$dst), - (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; } let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { @@ -4517,47 +4348,47 @@ let Predicates = [HasAVX512] in { def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), (VMOV64toPQIZrr GR64:$src)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>; - - def : Pat<(v8i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>; - // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), (VMOVDI2PDIZrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), - (VMOVDI2PDIZrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), - (VMOVDI2PDIZrm addr:$src)>; - def : Pat<(v4i32 (X86vzload addr:$src)), + def : Pat<(v4i32 (X86vzload32 addr:$src)), (VMOVDI2PDIZrm addr:$src)>; - def : Pat<(v8i32 (X86vzload addr:$src)), + def : Pat<(v8i32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (VMOVQI2PQIZrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), (VMOVZPQILo2PQIZrr VR128X:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), + def : Pat<(v2i64 (X86vzload64 addr:$src)), (VMOVQI2PQIZrm addr:$src)>; - def : Pat<(v4i64 (X86vzload addr:$src)), + def : Pat<(v4i64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; - // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. - def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>; - def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>; - // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. 
- def : Pat<(v16i32 (X86vzload addr:$src)), + def : Pat<(v16i32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; - def : Pat<(v8i64 (X86vzload addr:$src)), + def : Pat<(v8i64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; + + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVZPQILo2PQIZrr + (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), + sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVZPQILo2PQIZrr + (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), + sub_xmm)>; + + def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVZPQILo2PQIZrr + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), + sub_xmm)>; + def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVZPQILo2PQIZrr + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), + sub_xmm)>; } //===----------------------------------------------------------------------===// @@ -4686,7 +4517,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), - IsCommutable>, AVX512BIBase, EVEX_4V, + IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -4922,7 +4753,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - IsCommutable>, + IsCommutable, IsCommutable>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, @@ -5458,16 +5289,14 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (VecNode _.RC:$src1, _.RC:$src2, - (i32 FROUND_CURRENT)))>, + (_.VT (VecNode _.RC:$src1, _.RC:$src2))>, Sched<[sched]>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, - _.ScalarIntMemCPat:$src2, - (i32 FROUND_CURRENT)))>, + _.ScalarIntMemCPat:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), @@ -5495,7 +5324,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$rc)), IsCommutable>, + (i32 timm:$rc))>, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, @@ -5534,23 +5363,22 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC))>, EVEX_B, - Sched<[sched]>; + (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_B, Sched<[sched]>; } } multiclass 
avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode VecNode, X86SchedWriteSizes sched, - bit IsCommutable> { + SDNode VecNode, SDNode RndNode, + X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, sched.PS.Scl, IsCommutable>, - avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode, sched.PS.Scl, IsCommutable>, XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, sched.PD.Scl, IsCommutable>, - avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode, sched.PD.Scl, IsCommutable>, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } @@ -5565,17 +5393,17 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, VecNode, SaeNode, sched.PD.Scl, IsCommutable>, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } -defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds, SchedWriteFAddSizes, 1>; -defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmuls, X86fmulRnds, SchedWriteFMulSizes, 1>; -defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubs, X86fsubRnds, SchedWriteFAddSizes, 0>; -defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivs, X86fdivRnds, SchedWriteFDivSizes, 0>; -defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds, +defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs, SchedWriteFCmpSizes, 0>; -defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds, +defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, SchedWriteFCmpSizes, 0>; // MIN/MAX nodes are commutable under "unsafe-fp-math". 
In this case we use @@ -5618,13 +5446,13 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, bit IsCommutable, - bit IsKZCommutable = IsCommutable> { + bit IsKCommutable = IsCommutable> { let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0, - IsKZCommutable>, + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, + IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>; let mayLoad = 1 in { defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -5651,18 +5479,18 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, "$rc, $src2, $src1", "$src1, $src2, $rc", - (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNodeRnd, + SDPatternOperator OpNodeSAE, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, + (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>, EVEX_4V, EVEX_B, Sched<[sched]>; } @@ -5731,10 +5559,10 @@ defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>; defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, SchedWriteFCmpSizes, 0>, - avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>; + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>; defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, SchedWriteFCmpSizes, 0>, - avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>; + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>; let isCodeGenOnly = 1 in { defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, SchedWriteFCmpSizes, 1>; @@ -5750,71 +5578,25 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; -let Predicates = [HasVLX,HasDQI] in { - // Use packed logical operations for scalar ops. 
- def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), - (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), - FR64X)>; - def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), - (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), - FR64X)>; - def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), - (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), - FR64X)>; - def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), - (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), - FR64X)>; - - def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), - (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), - FR32X)>; - def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), - (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), - FR32X)>; - def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), - (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), - FR32X)>; - def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), - (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), - FR32X)>; -} - multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, + (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V, Sched<[sched]>; defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, + (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))), - (i32 FROUND_CURRENT))>, + (_.ScalarLdFrag addr:$src2))))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5825,332 +5607,139 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, + (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, Sched<[sched]>; defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2, - (i32 FROUND_CURRENT))>, + (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass 
avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, - SDNode OpNode, SDNode OpNodeScal, X86SchedWriteWidths sched> { - defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>, - avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>, + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>, + avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>, - avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>, + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>, + avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f32x_info>, - avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, sched.Scl>, - EVEX_4V,EVEX_CD8<32, CD8VT1>; - defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f64x_info>, - avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, sched.Scl>, - EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, + X86scalefsRnd, sched.Scl>, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, + X86scalefsRnd, sched.Scl>, + EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W; // Define only if AVX512VL feature is present. let Predicates = [HasVLX] in { - defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v4f32x_info>, + defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v8f32x_info>, + defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v2f64x_info>, + defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v4f64x_info>, + defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs, +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode, +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { - let ExeDomain = _.ExeDomain in { - let isCommutable = 1 in + // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG. + // There are just too many permuations due to commutability and bitcasts. 
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>, + (null_frag), (null_frag), 1>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)), - _.ImmAllZerosV)>, + (null_frag), (null_frag)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } - - // Patterns for compare with 0 that just use the same source twice. - def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)), - (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr") - _.RC:$src, _.RC:$src))>; - - def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))), - (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk") - _.KRC:$mask, _.RC:$src, _.RC:$src))>; } -multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode, +multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode (and _.RC:$src1, - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))), - _.ImmAllZerosV)>, + (null_frag), (null_frag)>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } -// Use 512bit version to implement 128/256 bit in case NoVLX. 
-multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo, - X86VectorVTInfo _, string Name> { - def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2), - _.ImmAllZerosV)), - (_.KVT (COPY_TO_REGCLASS - (!cast<Instruction>(Name # "Zrr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC))>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode (and _.RC:$src1, _.RC:$src2), - _.ImmAllZerosV))), - (COPY_TO_REGCLASS - (!cast<Instruction>(Name # "Zrrk") - (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC)>; - - def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)), - (_.KVT (COPY_TO_REGCLASS - (!cast<Instruction>(Name # "Zrr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx)), - _.KRC))>; - - def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))), - (COPY_TO_REGCLASS - (!cast<Instruction>(Name # "Zrrk") - (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx)), - _.KRC)>; -} - -multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode, - X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { +multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_vptest<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, NAME>, - avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512; + defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>, + avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, sched.YMM, _.info256, NAME>, - avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256; - defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, sched.XMM, _.info128, NAME>, - avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128; - } - let Predicates = [HasAVX512, NoVLX] in { - defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>; - defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>; + defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>, + avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256; + defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>, + avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128; } } -multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode, +multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { - defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, sched, + defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", sched, avx512vl_i32_info>; - defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, sched, + defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", sched, avx512vl_i64_info>, VEX_W; } multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, - PatFrag OpNode, X86SchedWriteWidths sched> { + X86SchedWriteWidths sched> { let Predicates = [HasBWI] in { - defm WZ: avx512_vptest<opc, 
OpcodeStr#"w", OpNode, sched.ZMM, + defm WZ: avx512_vptest<opc, OpcodeStr#"w", sched.ZMM, v32i16_info, NAME#"W">, EVEX_V512, VEX_W; - defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.ZMM, + defm BZ: avx512_vptest<opc, OpcodeStr#"b", sched.ZMM, v64i8_info, NAME#"B">, EVEX_V512; } let Predicates = [HasVLX, HasBWI] in { - defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.YMM, + defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM, v16i16x_info, NAME#"W">, EVEX_V256, VEX_W; - defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.XMM, + defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM, v8i16x_info, NAME#"W">, EVEX_V128, VEX_W; - defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.YMM, + defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM, v32i8x_info, NAME#"B">, EVEX_V256; - defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.XMM, + defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM, v16i8x_info, NAME#"B">, EVEX_V128; } - - let Predicates = [HasBWI, NoVLX] in { - defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">; - defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">; - defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">; - defm WZ128_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v8i16x_info, NAME#"W">; - } } -// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm -// as commutable here because we already canonicalized all zeros vectors to the -// RHS during lowering. -def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2), - (setcc node:$src1, node:$src2, SETEQ)>; -def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2), - (setcc node:$src1, node:$src2, SETNE)>; - multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, - PatFrag OpNode, X86SchedWriteWidths sched> : - avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, sched>, - avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, sched>; + X86SchedWriteWidths sched> : + avx512_vptest_wb<opc_wb, OpcodeStr, sched>, + avx512_vptest_dq<opc_dq, OpcodeStr, sched>; -defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem, +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", SchedWriteVecLogic>, T8PD; -defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm, +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", SchedWriteVecLogic>, T8XS; - -multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode, - X86VectorVTInfo _, - X86VectorVTInfo AndInfo> { - def : Pat<(_.KVT (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV)), - (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV))), - (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1, - _.RC:$src2)>; - - def : Pat<(_.KVT (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, - (AndInfo.LdFrag addr:$src2)))), - _.ImmAllZerosV)), - (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, - (AndInfo.LdFrag addr:$src2)))), - _.ImmAllZerosV))), - (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1, - addr:$src2)>; -} - -// Patterns to use 512-bit instructions when 128/256 are not available. 
-multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode, - X86VectorVTInfo _, - X86VectorVTInfo AndInfo, - X86VectorVTInfo ExtendInfo> { - def : Pat<(_.KVT (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV)), - (_.KVT (COPY_TO_REGCLASS - (!cast<Instruction>(InstrStr#"rr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC))>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV))), - (COPY_TO_REGCLASS - (!cast<Instruction>(InstrStr#"rrk") - (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC)>; -} - -multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode, - Predicate prd, - AVX512VLVectorVTInfo CmpInfo, - AVX512VLVectorVTInfo AndInfo> { -let Predicates = [prd, HasVLX] in { - defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode, - CmpInfo.info128, AndInfo.info128>; - defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode, - CmpInfo.info256, AndInfo.info256>; -} -let Predicates = [prd] in { - defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode, - CmpInfo.info512, AndInfo.info512>; -} - -let Predicates = [prd, NoVLX] in { - defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode, - CmpInfo.info128, AndInfo.info128, - CmpInfo.info512>; - defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode, - CmpInfo.info256, AndInfo.info256, - CmpInfo.info512>; -} -} - -multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode> { - defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI, - avx512vl_i8_info, avx512vl_i16_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI, - avx512vl_i8_info, avx512vl_i32_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI, - avx512vl_i8_info, avx512vl_i64_info>; - - defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI, - avx512vl_i16_info, avx512vl_i8_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI, - avx512vl_i16_info, avx512vl_i32_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI, - avx512vl_i16_info, avx512vl_i64_info>; - - defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512, - avx512vl_i32_info, avx512vl_i8_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512, - avx512vl_i32_info, avx512vl_i16_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512, - avx512vl_i32_info, avx512vl_i64_info>; - - defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512, - avx512vl_i64_info, avx512vl_i8_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512, - avx512vl_i64_info, avx512vl_i16_info>; - defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512, - avx512vl_i64_info, avx512vl_i32_info>; -} - -defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>; -defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>; - //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// @@ -6427,118 +6016,22 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, } } -defm 
VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>, - avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>; +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", X86vshlv, SchedWriteVarVecShift>, + avx512_var_shift_w<0x12, "vpsllvw", X86vshlv, SchedWriteVarVecShift>; -defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>, - avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", X86vsrav, SchedWriteVarVecShift>, + avx512_var_shift_w<0x11, "vpsravw", X86vsrav, SchedWriteVarVecShift>; -defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>, - avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecShift>, + avx512_var_shift_w<0x10, "vpsrlvw", X86vsrlv, SchedWriteVarVecShift>; defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>; -defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>; - -// Special handing for handling VPSRAV intrinsics. -multiclass avx512_var_shift_int_lowering<string InstrStr, SDNode OpNode, - X86VectorVTInfo _, list<Predicate> p> { - let Predicates = p in { - def : Pat<(_.VT (OpNode _.RC:$src1, _.RC:$src2)), - (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1, - _.RC:$src2)>; - def : Pat<(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))), - (!cast<Instruction>(InstrStr#_.ZSuffix##rm) - _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, _.RC:$src2), _.RC:$src0)), - (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0, - _.KRC:$mask, _.RC:$src1, _.RC:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), - _.RC:$src0)), - (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0, - _.KRC:$mask, _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)), - (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask, - _.RC:$src1, _.RC:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), - _.ImmAllZerosV)), - (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask, - _.RC:$src1, addr:$src2)>; - } -} - -multiclass avx512_var_shift_int_lowering_mb<string InstrStr, SDNode OpNode, - X86VectorVTInfo _, - list<Predicate> p> : - avx512_var_shift_int_lowering<InstrStr, OpNode, _, p> { - let Predicates = p in { - def : Pat<(_.VT (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - (!cast<Instruction>(InstrStr#_.ZSuffix##rmb) - _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2))), - _.RC:$src0)), - (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0, - _.KRC:$mask, _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2))), - _.ImmAllZerosV)), - (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask, - _.RC:$src1, addr:$src2)>; - } -} - -multiclass 
avx512_var_shift_int_lowering_vl<string InstrStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, - Predicate p> { - defm : avx512_var_shift_int_lowering<InstrStr, OpNode, VTInfo.info512, [p]>; - defm : avx512_var_shift_int_lowering<InstrStr, OpNode, VTInfo.info256, - [HasVLX, p]>; - defm : avx512_var_shift_int_lowering<InstrStr, OpNode, VTInfo.info128, - [HasVLX, p]>; -} - -multiclass avx512_var_shift_int_lowering_mb_vl<string InstrStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, - Predicate p> { - defm : avx512_var_shift_int_lowering_mb<InstrStr, OpNode, VTInfo.info512, [p]>; - defm : avx512_var_shift_int_lowering_mb<InstrStr, OpNode, VTInfo.info256, - [HasVLX, p]>; - defm : avx512_var_shift_int_lowering_mb<InstrStr, OpNode, VTInfo.info128, - [HasVLX, p]>; -} - -defm : avx512_var_shift_int_lowering_vl<"VPSRAVW", X86vsrav, avx512vl_i16_info, - HasBWI>; -defm : avx512_var_shift_int_lowering_mb_vl<"VPSRAVD", X86vsrav, - avx512vl_i32_info, HasAVX512>; -defm : avx512_var_shift_int_lowering_mb_vl<"VPSRAVQ", X86vsrav, - avx512vl_i64_info, HasAVX512>; - -defm : avx512_var_shift_int_lowering_vl<"VPSRLVW", X86vsrlv, avx512vl_i16_info, - HasBWI>; -defm : avx512_var_shift_int_lowering_mb_vl<"VPSRLVD", X86vsrlv, - avx512vl_i32_info, HasAVX512>; -defm : avx512_var_shift_int_lowering_mb_vl<"VPSRLVQ", X86vsrlv, - avx512vl_i64_info, HasAVX512>; - -defm : avx512_var_shift_int_lowering_vl<"VPSLLVW", X86vshlv, avx512vl_i16_info, - HasBWI>; -defm : avx512_var_shift_int_lowering_mb_vl<"VPSLLVD", X86vshlv, - avx512vl_i32_info, HasAVX512>; -defm : avx512_var_shift_int_lowering_mb_vl<"VPSLLVQ", X86vshlv, - avx512vl_i64_info, HasAVX512>; +defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>; // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
@@ -6860,17 +6353,20 @@ let Predicates = [HasAVX512] in { def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; } let SchedRW = [WriteFStore] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), - (bc_v2f64 (v4f32 VR128X:$src))), - (iPTR 0))), addr:$dst)]>, - EVEX, EVEX_CD8<32, CD8VT2>; + []>, EVEX, EVEX_CD8<32, CD8VT2>; def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovhpd\t{$src, $dst|$dst, $src}", @@ -6878,12 +6374,11 @@ def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), (iPTR 0))), addr:$dst)]>, EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +let mayStore = 1, hasSideEffects = 0 in def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)), - (iPTR 0))), addr:$dst)]>, - EVEX, EVEX_CD8<32, CD8VT2>; + []>, EVEX, EVEX_CD8<32, CD8VT2>; def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovlpd\t{$src, $dst|$dst, $src}", @@ -6936,7 +6431,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, + (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; } @@ -7011,7 +6506,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))), 1, 1, vselect, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; } @@ -7089,7 +6584,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), + (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))), 1, 1, vselect, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; } @@ -7165,7 +6660,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc), !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"), !if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; }// isCodeGenOnly = 1 @@ -7184,7 +6679,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> 
opc132, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, (_.ScalarLdFrag addr:$src3)))), (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1, - _.FRC:$src3, (i32 imm:$rc)))), 0>; + _.FRC:$src3, (i32 timm:$rc)))), 0>; defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, @@ -7192,7 +6687,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3, - _.FRC:$src1, (i32 imm:$rc)))), 1>; + _.FRC:$src1, (i32 timm:$rc)))), 1>; // One pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -7202,7 +6697,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3), _.FRC:$src1, _.FRC:$src2))), (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3, - _.FRC:$src2, (i32 imm:$rc)))), 1>; + _.FRC:$src2, (i32 timm:$rc)))), 1>; } } @@ -7366,62 +6861,62 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (RndOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src3, (i32 imm:$rc)))))), + _.FRC:$src3, (i32 timm:$rc)))))), (!cast<I>(Prefix#"213"#Suffix#"Zrb_Int") VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (RndOp _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (i32 imm:$rc)))))), + (i32 timm:$rc)))))), (!cast<I>(Prefix#"231"#Suffix#"Zrb_Int") VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (RndOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src3, (i32 imm:$rc)), + _.FRC:$src3, (i32 timm:$rc)), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (RndOp _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (i32 imm:$rc)), + (i32 timm:$rc)), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (RndOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src3, (i32 imm:$rc)), + _.FRC:$src3, (i32 timm:$rc)), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS 
_.FRC:$src2, VR128X)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (RndOp _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (i32 imm:$rc)), + (i32 timm:$rc)), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; } } @@ -7501,44 +6996,44 @@ defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h, // AVX-512 Scalar convert from sign integer to float/double //===----------------------------------------------------------------------===// -multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched, +multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched, RegisterClass SrcRC, X86VectorVTInfo DstVT, - X86MemOperand x86memop, PatFrag ld_frag, string asm> { - let hasSideEffects = 0 in { + X86MemOperand x86memop, PatFrag ld_frag, string asm, + string mem> { + let hasSideEffects = 0, isCodeGenOnly = 1 in { def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - EVEX_4V, Sched<[sched]>; + EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, x86memop:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } // hasSideEffects = 0 - let isCodeGenOnly = 1 in { - def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), - (ins DstVT.RC:$src1, SrcRC:$src2), - !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set DstVT.RC:$dst, - (OpNode (DstVT.VT DstVT.RC:$src1), - SrcRC:$src2, - (i32 FROUND_CURRENT)))]>, - EVEX_4V, Sched<[sched]>; - - def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), - (ins DstVT.RC:$src1, x86memop:$src2), - !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set DstVT.RC:$dst, - (OpNode (DstVT.VT DstVT.RC:$src1), - (ld_frag addr:$src2), - (i32 FROUND_CURRENT)))]>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - }//isCodeGenOnly = 1 + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>, + EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>; + + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, x86memop:$src2), + asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + (ld_frag addr:$src2)))]>, + EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst, + DstVT.RC:$src1, SrcRC:$src2), 0, "att">; } multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched, RegisterClass SrcRC, - X86VectorVTInfo DstVT, string asm> { + X86VectorVTInfo DstVT, string asm, + string mem> { def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, 
SrcRC:$src2, AVX512RC:$rc), !strconcat(asm, @@ -7546,37 +7041,44 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, [(set DstVT.RC:$dst, (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2, - (i32 imm:$rc)))]>, - EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; + (i32 timm:$rc)))]>, + EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>; + def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}", + (!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst, + DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">; } -multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, +multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, SDNode OpNodeRnd, X86FoldableSchedWrite sched, RegisterClass SrcRC, X86VectorVTInfo DstVT, - X86MemOperand x86memop, PatFrag ld_frag, string asm> { - defm NAME : avx512_vcvtsi_round<opc, OpNode, sched, SrcRC, DstVT, asm>, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, string mem> { + defm NAME : avx512_vcvtsi_round<opc, OpNodeRnd, sched, SrcRC, DstVT, asm, mem>, avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop, - ld_frag, asm>, VEX_LIG; + ld_frag, asm, mem>, VEX_LIG; } let Predicates = [HasAVX512] in { -defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32, - v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">, +defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, + WriteCvtI2SS, GR32, + v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64, - v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">, +defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, + WriteCvtI2SS, GR64, + v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32, - v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">, - XD, EVEX_CD8<32, CD8VT1>; -defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64, - v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, +defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32, + v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">, + XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, + WriteCvtI2SD, GR64, + v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -7596,23 +7098,26 @@ def : Pat<(f64 (sint_to_fp GR32:$src)), def : Pat<(f64 (sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; -defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32, +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, + WriteCvtI2SS, GR32, v4f32x_info, i32mem, loadi32, - "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64, - v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, + "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>; +defm 
VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, + WriteCvtI2SS, GR64, + v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info, - i32mem, loadi32, "cvtusi2sd{l}">, +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info, + i32mem, loadi32, "cvtusi2sd", "l">, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64, - v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, + WriteCvtI2SD, GR64, + v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -7641,8 +7146,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, X86VectorVTInfo DstVT, SDNode OpNode, SDNode OpNodeRnd, X86FoldableSchedWrite sched, string asm, - string aliasStr, - bit CodeGenOnly = 1> { + string aliasStr> { let Predicates = [HasAVX512] in { def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), @@ -7650,34 +7154,23 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, EVEX, VEX_LIG, Sched<[sched]>; def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc), !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), - [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>, + [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>, EVEX, VEX_LIG, EVEX_B, EVEX_RC, Sched<[sched]>; - let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; - - def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">; - def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}", - (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">; } // Predicates = [HasAVX512] -} -multiclass avx512_cvt_s_int_round_aliases<bits<8> opc, X86VectorVTInfo SrcVT, - X86VectorVTInfo DstVT, SDNode OpNode, - SDNode OpNodeRnd, - X86FoldableSchedWrite sched, string asm, - string aliasStr> : - avx512_cvt_s_int_round<opc, SrcVT, DstVT, OpNode, OpNodeRnd, sched, asm, aliasStr, 0> { - let Predicates = [HasAVX512] in { - def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst, - SrcVT.IntScalarMemOp:$src), 0, "att">; - } // Predicates = [HasAVX512] + def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">; + 
def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}", + (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">; + def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst, + SrcVT.IntScalarMemOp:$src), 0, "att">; } // Convert float/double to signed/unsigned int 32/64 @@ -7687,10 +7180,10 @@ defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si, defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info, X86cvts2usi, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info, X86cvts2usi, +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si, @@ -7699,10 +7192,10 @@ defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si, defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info, X86cvts2usi, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info, X86cvts2usi, +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -7793,19 +7286,18 @@ def : Pat<(v2f64 (X86Movsd // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, - SDNode OpNodeInt, SDNode OpNodeRnd, - X86FoldableSchedWrite sched, string aliasStr, - bit CodeGenOnly = 1>{ + SDNode OpNodeInt, SDNode OpNodeSAE, + X86FoldableSchedWrite sched, string aliasStr>{ let Predicates = [HasAVX512] in { let isCodeGenOnly = 1 in { def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, - EVEX, Sched<[sched]>; + EVEX, VEX_LIG, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, - EVEX, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), @@ -7814,63 +7306,49 @@ let Predicates = [HasAVX512] in { EVEX, VEX_LIG, Sched<[sched]>; def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), - [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), - (i32 FROUND_NO_EXC)))]>, - EVEX,VEX_LIG , EVEX_B, Sched<[sched]>; - let isCodeGenOnly = CodeGenOnly, 
ForceDisassemble = CodeGenOnly in + [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>, + EVEX, VEX_LIG, EVEX_B, Sched<[sched]>; def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; +} //HasAVX512 def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">; def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}", (!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">; -} //HasAVX512 -} - -multiclass avx512_cvt_s_all_unsigned<bits<8> opc, string asm, - X86VectorVTInfo _SrcRC, - X86VectorVTInfo _DstRC, SDNode OpNode, - SDNode OpNodeInt, SDNode OpNodeRnd, - X86FoldableSchedWrite sched, - string aliasStr> : - avx512_cvt_s_all<opc, asm, _SrcRC, _DstRC, OpNode, OpNodeInt, OpNodeRnd, sched, - aliasStr, 0> { -let Predicates = [HasAVX512] in { def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst, _SrcRC.IntScalarMemOp:$src), 0, "att">; } -} defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSS2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntRnd, WriteCvtSD2I, + fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I, +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSS2I, +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I, +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntRnd, WriteCvtSD2I, +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, + fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; 
//===----------------------------------------------------------------------===// @@ -7884,15 +7362,13 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2), - (i32 FROUND_CURRENT)))>, + (_Src.VT _Src.RC:$src2)))>, EVEX_4V, VEX_LIG, Sched<[sched]>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), - (_Src.VT _Src.ScalarIntMemCPat:$src2), - (i32 FROUND_CURRENT)))>, + (_Src.VT _Src.ScalarIntMemCPat:$src2)))>, EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -7911,14 +7387,13 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ // Scalar Coversion with SAE - suppress all exceptions multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (_.VT (OpNodeRnd (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2), - (i32 FROUND_NO_EXC)))>, + (_.VT (OpNodeSAE (_.VT _.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; } @@ -7930,34 +7405,36 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>, EVEX_4V, VEX_LIG, Sched<[sched]>, EVEX_B, EVEX_RC; } multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, - SDNode OpNodeRnd, X86FoldableSchedWrite sched, - X86VectorVTInfo _src, X86VectorVTInfo _dst> { + SDNode OpNode, SDNode OpNodeRnd, + X86FoldableSchedWrite sched, + X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>, avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD; } } -multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched, X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, - avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>, EVEX_CD8<32, CD8VT1>, XS; } } -defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", - X86froundRnd, WriteCvtSD2SS, f64x_info, +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds, + X86froundsRnd, WriteCvtSD2SS, f64x_info, f32x_info>; -defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", - X86fpextRnd, WriteCvtSS2SD, f32x_info, +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts, + X86fpextsSAE, 
WriteCvtSS2SD, f32x_info, f64x_info>; def : Pat<(f64 (fpextend FR32X:$src)), @@ -7967,14 +7444,6 @@ def : Pat<(f64 (fpextend (loadf32 addr:$src))), (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(f64 (extloadf32 addr:$src)), - (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX512, OptForSize]>; - -def : Pat<(f64 (extloadf32 addr:$src)), - (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, - Requires<[HasAVX512, OptForSpeed]>; - def : Pat<(f32 (fpround FR64X:$src)), (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>, Requires<[HasAVX512]>; @@ -8003,7 +7472,8 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86FoldableSchedWrite sched, string Broadcast = _.BroadcastStr, string Alias = "", X86MemOperand MemOp = _Src.MemOp, - RegisterClass MaskRC = _.KRCWM> { + RegisterClass MaskRC = _.KRCWM, + dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> { defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), @@ -8022,12 +7492,8 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, (ins _.RC:$src0, MaskRC:$mask, MemOp:$src), (ins MaskRC:$mask, MemOp:$src), OpcodeStr#Alias, "$src", "$src", - (_.VT (OpNode (_Src.VT - (_Src.LdFrag addr:$src)))), - (vselect MaskRC:$mask, - (_.VT (OpNode (_Src.VT - (_Src.LdFrag addr:$src)))), - _.RC:$src0), + LdDAG, + (vselect MaskRC:$mask, LdDAG, _.RC:$src0), vselect, "$src0 = $dst">, EVEX, Sched<[sched.Folded]>; @@ -8052,13 +7518,12 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, } // Coversion with SAE - suppress all exceptions multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", - (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), - (i32 FROUND_NO_EXC)))>, + (_.VT (OpNodeSAE (_Src.VT _Src.RC:$src)))>, EVEX, EVEX_B, Sched<[sched]>; } @@ -8069,23 +7534,34 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", - (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 timm:$rc)))>, EVEX, EVEX_B, EVEX_RC, Sched<[sched]>; } +// Similar to avx512_vcvt_fp, but uses an extload for the memory form. 
+multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + X86FoldableSchedWrite sched, + string Broadcast = _.BroadcastStr, + string Alias = "", X86MemOperand MemOp = _Src.MemOp, + RegisterClass MaskRC = _.KRCWM> + : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, sched, Broadcast, Alias, + MemOp, MaskRC, + (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>; + // Extend Float to Double multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, + defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, - X86vfpextRnd, sched.ZMM>, EVEX_V512; + X86vfpextSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, + defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info, X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend, + defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend, sched.YMM>, EVEX_V256; } } @@ -8093,7 +7569,7 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, // Truncate Double to Float multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, sched.ZMM>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfpround, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfproundRnd, sched.ZMM>, EVEX_V512; } @@ -8101,18 +7577,49 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround, + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86vfpround, sched.YMM, "{1to4}", "{y}">, EVEX_V256; - - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; } + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + "$dst {${mask}}, 
${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; } defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, @@ -8120,20 +7627,66 @@ defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>, PS, EVEX_CD8<32, CD8VH>; -def : Pat<(v8f64 (extloadv8f32 addr:$src)), - (VCVTPS2PDZrm addr:$src)>; +let Predicates = [HasAVX512] in { + def : Pat<(v8f32 (fpround (v8f64 VR512:$src))), + (VCVTPD2PSZrr VR512:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))), + VR256X:$src0), + (VCVTPD2PSZrrk VR256X:$src0, VK8WM:$mask, VR512:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))), + v8f32x_info.ImmAllZerosV), + (VCVTPD2PSZrrkz VK8WM:$mask, VR512:$src)>; + + def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))), + VR256X:$src0), + (VCVTPD2PSZrmk VR256X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))), + v8f32x_info.ImmAllZerosV), + (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; + + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + (VCVTPD2PSZrmb addr:$src)>; + def : Pat<(vselect VK8WM:$mask, + (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (v8f32 VR256X:$src0)), + (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(vselect VK8WM:$mask, + (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + v8f32x_info.ImmAllZerosV), + (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; +} let Predicates = [HasVLX] in { - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), - (VCVTPD2PSZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), - (VCVTPD2PSZ128rm addr:$src)>; - def : Pat<(v2f64 (extloadv2f32 addr:$src)), - (VCVTPS2PDZ128rm addr:$src)>; - def : Pat<(v4f64 (extloadv4f32 addr:$src)), - (VCVTPS2PDZ256rm addr:$src)>; + def : Pat<(v4f32 (fpround (v4f64 VR256X:$src))), + (VCVTPD2PSZ256rr VR256X:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))), + VR128X:$src0), + 
(VCVTPD2PSZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))), + v4f32x_info.ImmAllZerosV), + (VCVTPD2PSZ256rrkz VK4WM:$mask, VR256X:$src)>; + + def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), + (VCVTPD2PSZ256rm addr:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))), + VR128X:$src0), + (VCVTPD2PSZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))), + v4f32x_info.ImmAllZerosV), + (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (VCVTPD2PSZ256rmb addr:$src)>; + def : Pat<(vselect VK4WM:$mask, + (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + VR128X:$src0), + (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(vselect VK4WM:$mask, + (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + v4f32x_info.ImmAllZerosV), + (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; // Special patterns to allow use of X86vmfpround for masking. Instruction // patterns have been disabled with null_frag. @@ -8175,7 +7728,11 @@ multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, - OpNode128, sched.XMM, "{1to2}", "", i64mem>, EVEX_V128; + OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM, + (v2f64 (OpNode128 (bc_v4i32 + (v2i64 + (scalar_to_vector (loadi64 addr:$src))))))>, + EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode, sched.YMM>, EVEX_V256; } @@ -8200,12 +7757,12 @@ multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, // Convert Float to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode OpNodeSAE, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, - OpNodeRnd, sched.ZMM>, EVEX_V512; + OpNodeSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, @@ -8234,12 +7791,12 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Convert Double to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode OpNodeSAE, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, - OpNodeRnd, sched.ZMM>, EVEX_V512; + OpNodeSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 @@ -8251,16 +7808,49 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256; - - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, 
i128mem:$src), 0, "intel">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">; } + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; } // Convert Double to Signed/Unsigned Doubleword @@ -8282,16 +7872,47 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256; - - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; } + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : 
InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, f64mem:$src), 0, "att">; + + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, + f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, f64mem:$src), 0, "att">; } // Convert Double to Signed/Unsigned Quardword @@ -8358,7 +7979,11 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, - sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; + sched.XMM, "{1to2}", "", f64mem, VK2WM, + (v2i64 (OpNode (bc_v4f32 + (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))>, + EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, sched.YMM>, EVEX_V256; } @@ -8376,7 +8001,11 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, - sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; + sched.XMM, "{1to2}", "", f64mem, VK2WM, + (v2i64 (OpNode (bc_v4f32 + (v2f64 + (scalar_to_vector (loadf64 addr:$src))))))>, + EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, sched.YMM>, EVEX_V256; } @@ -8384,8 +8013,7 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Convert Signed/Unsigned Quardword to Float multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128, SDNode OpNodeRnd, - X86SchedWriteWidths sched> { + SDNode OpNodeRnd, X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode, sched.ZMM>, @@ -8397,22 +8025,57 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, // memory forms of these instructions in Asm Parcer. 
They have the same // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128, - sched.XMM, "{1to2}", "{x}">, EVEX_V128, - NotEVEX2VEXConvertible; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag, + sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>, + EVEX_V128, NotEVEX2VEXConvertible; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256, NotEVEX2VEXConvertible; - - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">; } + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to2}}", + (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|" + "$dst {${mask}}, $src}", + (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, $src}", + (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + "$dst {${mask}}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, ${src}{1to4}}", + (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; } defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP, @@ -8423,19 +8086,19 @@ defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPS2DQ>, + X86cvttp2siSAE, 
SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPD2DQ>, + X86cvttp2siSAE, SchedWriteCvtPD2DQ>, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS, + X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, + X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, PS, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, @@ -8479,19 +8142,19 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, EVEX_CD8<32, CD8VH>; defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W, + X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si, - X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD, + X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W, + X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui, - X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD, + X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, @@ -8502,67 +8165,15 @@ defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP, +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP, +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD, EVEX_CD8<64, CD8VF>; -let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))), - (VCVTTPS2DQZrr VR512:$src)>; - def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))), - (VCVTTPS2DQZrm addr:$src)>; - - def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))), - (VCVTTPS2UDQZrr VR512:$src)>; - def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))), - (VCVTTPS2UDQZrm addr:$src)>; - - def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))), - (VCVTTPD2DQZrr VR512:$src)>; - def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))), - (VCVTTPD2DQZrm addr:$src)>; - - def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))), - (VCVTTPD2UDQZrr VR512:$src)>; - def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))), - (VCVTTPD2UDQZrm addr:$src)>; -} - let Predicates = [HasVLX] in { - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))), - (VCVTTPS2DQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), - (VCVTTPS2DQZ128rm addr:$src)>; - - def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))), - (VCVTTPS2UDQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))), - (VCVTTPS2UDQZ128rm addr:$src)>; - - def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))), - (VCVTTPS2DQZ256rr VR256X:$src)>; - def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), - (VCVTTPS2DQZ256rm addr:$src)>; - - def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src))), - (VCVTTPS2UDQZ256rr VR256X:$src)>; - def : 
Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))), - (VCVTTPS2UDQZ256rm addr:$src)>; - - def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))), - (VCVTTPD2DQZ256rr VR256X:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), - (VCVTTPD2DQZ256rm addr:$src)>; - - def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))), - (VCVTTPD2UDQZ256rr VR256X:$src)>; - def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))), - (VCVTTPD2UDQZ256rm addr:$src)>; - // Special patterns to allow use of X86mcvtp2Int for masking. Instruction // patterns have been disabled with null_frag. def : Pat<(v4i32 (X86cvtp2Int (v2f64 VR128X:$src))), @@ -8680,72 +8291,64 @@ let Predicates = [HasVLX] in { (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } -let Predicates = [HasDQI] in { - def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))), - (VCVTTPS2QQZrr VR256X:$src)>; - def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))), - (VCVTTPS2QQZrm addr:$src)>; - - def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))), - (VCVTTPS2UQQZrr VR256X:$src)>; - def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))), - (VCVTTPS2UQQZrm addr:$src)>; - - def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))), - (VCVTTPD2QQZrr VR512:$src)>; - def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))), - (VCVTTPD2QQZrm addr:$src)>; - - def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))), - (VCVTTPD2UQQZrr VR512:$src)>; - def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))), - (VCVTTPD2UQQZrm addr:$src)>; -} - let Predicates = [HasDQI, HasVLX] in { - def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))), - (VCVTTPS2QQZ256rr VR128X:$src)>; - def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))), - (VCVTTPS2QQZ256rm addr:$src)>; - - def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))), - (VCVTTPS2UQQZ256rr VR128X:$src)>; - def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))), - (VCVTTPS2UQQZ256rm addr:$src)>; - - def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))), - (VCVTTPD2QQZ128rr VR128X:$src)>; - def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))), - (VCVTTPD2QQZ128rm addr:$src)>; - - def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))), - (VCVTTPD2UQQZ128rr VR128X:$src)>; - def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))), - (VCVTTPD2UQQZ128rm addr:$src)>; - - def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))), - (VCVTTPD2QQZ256rr VR256X:$src)>; - def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))), - (VCVTTPD2QQZ256rm addr:$src)>; - - def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))), - (VCVTTPD2UQQZ256rr VR256X:$src)>; - def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))), - (VCVTTPD2UQQZ256rm addr:$src)>; + def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTPS2QQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTPS2UQQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2i64 (X86cvttp2si 
(bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTTPS2QQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + (VCVTTPS2UQQZ128rm addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2i64 (vselect VK2WM:$mask, + (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), + (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; } let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), +def : Pat<(v8i32 (X86cvttp2ui (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), +def : Pat<(v4i32 (X86cvttp2ui (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))), +def : Pat<(v4i32 (X86cvttp2ui (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_xmm)>; @@ -8771,80 +8374,117 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))), VR128X:$src1, sub_xmm)))), sub_xmm)>; } -let Predicates = [HasAVX512, HasVLX] in { - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))), - (VCVTPD2DQZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), - (VCVTPD2DQZ128rm addr:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))), - (VCVTPD2UDQZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))), - (VCVTTPD2DQZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), - (VCVTTPD2DQZ128rm addr:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))), - (VCVTTPD2UDQZ128rr VR128X:$src)>; - - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), - (VCVTDQ2PDZ128rm addr:$src)>; - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), +let Predicates = [HasVLX] in { + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTDQ2PDZ128rm addr:$src)>; - - def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), - (VCVTUDQ2PDZ128rm addr:$src)>; - def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + v2f64x_info.ImmAllZerosV)), + (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTUDQ2PDZ128rm 
addr:$src)>; -} - -let Predicates = [HasAVX512] in { - def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), - (VCVTPD2PSZrm addr:$src)>; - def : Pat<(v8f64 (extloadv8f32 addr:$src)), - (VCVTPS2PDZrm addr:$src)>; + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + VR128X:$src0)), + (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(v2f64 (vselect VK2WM:$mask, + (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + v2f64x_info.ImmAllZerosV)), + (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; } let Predicates = [HasDQI, HasVLX] in { - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))), + // Special patterns to allow use of X86VMSintToFP for masking. Instruction + // patterns have been disabled with null_frag. + def : Pat<(v4f32 (X86VSintToFP (v2i64 VR128X:$src))), (VCVTQQ2PSZ128rr VR128X:$src)>; - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))), + def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; + + def : Pat<(v4f32 (X86VSintToFP (loadv2i64 addr:$src))), + (VCVTQQ2PSZ128rm addr:$src)>; + def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + (VCVTQQ2PSZ128rmb addr:$src)>; + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + (v4f32 VR128X:$src0), VK2WM:$mask), + (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + v4f32x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; + + // Special patterns to allow use of X86VMUintToFP for masking. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 VR128X:$src))), (VCVTUQQ2PSZ128rr VR128X:$src)>; + def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; + + def : Pat<(v4f32 (X86VUintToFP (loadv2i64 addr:$src))), + (VCVTUQQ2PSZ128rm addr:$src)>; + def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), + VK2WM:$mask), + (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + (VCVTUQQ2PSZ128rmb addr:$src)>; + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + (v4f32 VR128X:$src0), VK2WM:$mask), + (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + v4f32x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } let Predicates = [HasDQI, NoVLX] in { -def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))), +def : Pat<(v2i64 (X86cvttp2si (v2f64 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))), +def : Pat<(v4i64 (X86cvttp2si (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_ymm)>; -def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))), +def : Pat<(v4i64 (X86cvttp2si (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))), +def : Pat<(v2i64 (X86cvttp2ui (v2f64 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))), +def : Pat<(v4i64 (X86cvttp2ui (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_ymm)>; -def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))), +def : Pat<(v4i64 (X86cvttp2ui (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -8903,8 +8543,7 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "{sae}, $src", "$src, {sae}", - (X86cvtph2psRnd (_src.VT _src.RC:$src), - (i32 FROUND_NO_EXC))>, + (X86cvtph2psSAE (_src.VT _src.RC:$src))>, T8PD, EVEX_B, Sched<[sched]>; } @@ -8923,9 +8562,7 @@ let Predicates = [HasVLX] in { EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. 
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), - (VCVTPH2PSZ128rm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (VCVTPH2PSZ128rm addr:$src)>; def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), @@ -9088,12 +8725,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - EVEX_4V, Sched<[sched]>; + EVEX_4V, VEX_LIG, Sched<[sched]>; defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - _.ScalarIntMemCPat:$src2)>, EVEX_4V, + _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9162,47 +8799,45 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode OpNode, X86FoldableSchedWrite sched> { + SDNode OpNode, SDNode OpNodeSAE, + X86FoldableSchedWrite sched> { let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_CURRENT))>, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, Sched<[sched]>; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC))>, EVEX_B, - Sched<[sched]>; + (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_B, Sched<[sched]>; defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - (i32 FROUND_CURRENT))>, + (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched> { - defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, sched>, - EVEX_CD8<32, CD8VT1>; - defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, sched>, - EVEX_CD8<64, CD8VT1>, VEX_W; + SDNode OpNodeSAE, X86FoldableSchedWrite sched> { + defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE, + sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG; + defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE, + sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; } let Predicates = [HasERI] in { - defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>, - T8PD, EVEX_4V; - defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs, + SchedWriteFRcp.Scl>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs, SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V; } -defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, SchedWriteFRnd.Scl>, T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, 
rsqrt28pd @@ -9211,42 +8846,40 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>, + (OpNode (_.VT _.RC:$src))>, Sched<[sched]>; defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", (OpNode (_.VT - (bitconvert (_.LdFrag addr:$src))), - (i32 FROUND_CURRENT))>, + (bitconvert (_.LdFrag addr:$src))))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))), - (i32 FROUND_CURRENT))>, EVEX_B, - Sched<[sched.Folded, sched.ReadAfterFold]>; + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } -multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, +multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, X86FoldableSchedWrite sched> { let ExeDomain = _.ExeDomain in defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", - (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, + (OpNode (_.VT _.RC:$src))>, EVEX_B, Sched<[sched]>; } multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched> { + SDNode OpNodeSAE, X86SchedWriteWidths sched> { defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>, - avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>, + avx512_fp28_p_sae<opc, OpcodeStr#"ps", v16f32_info, OpNodeSAE, sched.ZMM>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>, - avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>, + avx512_fp28_p_sae<opc, OpcodeStr#"pd", v8f64_info, OpNodeSAE, sched.ZMM>, T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -9254,24 +8887,32 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { - defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>, - EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>, - EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>, - EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>, - EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, + sched.XMM>, + EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, + sched.YMM>, + EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, + sched.XMM>, + EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, + sched.YMM>, + EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; } } let Predicates = [HasERI] in { - defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX; - defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX; - defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX; -} -defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>, - avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd, + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE, + SchedWriteFRsqrt>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE, + SchedWriteFRcp>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE, + SchedWriteFAdd>, EVEX; +} +defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE, + SchedWriteFRnd>, + avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, EVEX; multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, @@ -9279,7 +8920,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, let ExeDomain = _.ExeDomain in defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", - (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc)))>, + (_.VT (X86fsqrtRnd _.RC:$src, (i32 timm:$rc)))>, EVEX, EVEX_B, EVEX_RC, Sched<[sched]>; } @@ -9345,23 +8986,21 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (X86fsqrtRnds (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (i32 FROUND_CURRENT))>, + (X86fsqrts (_.VT _.RC:$src1), + (_.VT _.RC:$src2))>, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (X86fsqrtRnds (_.VT _.RC:$src1), - _.ScalarIntMemCPat:$src2, - (i32 FROUND_CURRENT))>, + (X86fsqrts (_.VT _.RC:$src1), + _.ScalarIntMemCPat:$src2)>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (X86fsqrtRnds (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$rc))>, + (i32 timm:$rc))>, EVEX_B, EVEX_RC, Sched<[sched]>; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in { @@ -9416,8 +9055,8 @@ multiclass 
avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", - (_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B, + (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3)))>, EVEX_B, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -9443,50 +9082,26 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, } let Predicates = [HasAVX512] in { - def : Pat<(ffloor _.FRC:$src), - (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0x9)))>; - def : Pat<(fceil _.FRC:$src), + def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2), (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0xa)))>; - def : Pat<(ftrunc _.FRC:$src), - (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0xb)))>; - def : Pat<(frint _.FRC:$src), - (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0x4)))>; - def : Pat<(fnearbyint _.FRC:$src), - (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src, (i32 0xc)))>; + _.FRC:$src1, imm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0x9)))>; - def : Pat<(fceil (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0xa)))>; - def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), + def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2), (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0xb)))>; - def : Pat<(frint (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0x4)))>; - def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), - (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src, (i32 0xc)))>; + addr:$src1, imm:$src2))>; } } defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless", SchedWriteFRnd.Scl, f32x_info>, - AVX512AIi8Base, EVEX_4V, + AVX512AIi8Base, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd", SchedWriteFRnd.Scl, f64x_info>, - VEX_W, AVX512AIi8Base, EVEX_4V, + VEX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move, @@ -9514,32 +9129,6 @@ defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd, (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info, fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>; -multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move, - X86VectorVTInfo _, PatLeaf ZeroFP, - bits<8> ImmV, Predicate BasePredicate> { - let Predicates = [BasePredicate] in { - def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask, - (OpNode (extractelt _.VT:$src2, (iPTR 0))), - (extractelt _.VT:$dst, (iPTR 0))))), - (!cast<Instruction>("V"#OpcPrefix#Zr_Intk) - _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; - - def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask, - (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), - (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz) - 
VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; - } -} - -defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss, - v4f32x_info, fp32imm0, 0x01, HasAVX512>; -defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss, - v4f32x_info, fp32imm0, 0x02, HasAVX512>; -defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd, - v2f64x_info, fp64imm0, 0x01, HasAVX512>; -defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd, - v2f64x_info, fp64imm0, 0x02, HasAVX512>; - //------------------------------------------------- // Integer truncate and extend operations @@ -9999,26 +9588,14 @@ multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> { let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; } // 512-bit patterns @@ -10040,41 +9617,6 @@ multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> { } } -multiclass AVX512_pmovx_patterns_aext<string OpcPrefix, SDNode ExtOp> : - AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> { - let Predicates = [HasVLX, HasBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))), - (!cast<I>(OpcPrefix#BWZ256rr) VR128X:$src)>; - } - - let Predicates = [HasVLX] in { - def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))), - (!cast<I>(OpcPrefix#WDZ256rr) VR128X:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))), - (!cast<I>(OpcPrefix#DQZ256rr) VR128X:$src)>; - } - - // 512-bit patterns - let Predicates = [HasBWI] in { - def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))), - (!cast<I>(OpcPrefix#BWZrr) VR256X:$src)>; - } - let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))), - (!cast<I>(OpcPrefix#BDZrr) VR128X:$src)>; - def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))), - (!cast<I>(OpcPrefix#WDZrr) VR256X:$src)>; - - def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))), - (!cast<I>(OpcPrefix#WQZrr) VR128X:$src)>; - - def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))), - (!cast<I>(OpcPrefix#DQZrr) VR256X:$src)>; - } -} - - multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, SDNode InVecOp> : AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> { @@ -10084,115 +9626,62 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))), + def : 
Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))), - (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 
addr:$src)))))), (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; } // 512-bit patterns let Predicates = [HasAVX512] in { def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; - def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))), - (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; } } defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>; defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>; -defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>; - -// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge -// ext+trunc aggresively making it impossible to legalize the DAG to this -// pattern directly. -let Predicates = [HasAVX512, NoBWI] in { -def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), - (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; -def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), - (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; -def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), - (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; -} // Without BWI we can't do a trunc from v16i16 to v16i8. 
DAG combine can merge // ext+trunc aggresively making it impossible to legalize the DAG to this @@ -10200,10 +9689,8 @@ def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), let Predicates = [HasAVX512, NoBWI] in { def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; -def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), +def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))), (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; -def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), - (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; } //===----------------------------------------------------------------------===// @@ -10490,7 +9977,7 @@ multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr, X86FoldableSchedWrite sched> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (X86compress _.RC:$src1))>, AVX5128IBase, + (null_frag)>, AVX5128IBase, Sched<[sched]>; let mayStore = 1, hasSideEffects = 0 in @@ -10512,6 +9999,13 @@ multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> { def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask), (!cast<Instruction>(Name#_.ZSuffix##mrk) addr:$dst, _.KRCWM:$mask, _.RC:$src)>; + + def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix##rrk) + _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>; + def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix##rrkz) + _.KRCWM:$mask, _.RC:$src)>; } multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, @@ -10545,13 +10039,12 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr, X86FoldableSchedWrite sched> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (X86expand _.RC:$src1))>, AVX5128IBase, + (null_frag)>, AVX5128IBase, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (X86expand (_.VT (bitconvert - (_.LdFrag addr:$src1)))))>, + (null_frag)>, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10570,6 +10063,13 @@ multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> { (_.VT _.RC:$src0))), (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0, _.KRCWM:$mask, addr:$src)>; + + def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix##rrk) + _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>; + def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix##rrkz) + _.KRCWM:$mask, _.RC:$src)>; } multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, @@ -10636,18 +10136,17 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2), - (i32 FROUND_NO_EXC))>, + (i32 imm:$src2))>, EVEX_B, Sched<[sched]>; } multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ let Predicates = [prd] in { defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, 
- avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE, sched.ZMM, _.info512>, EVEX_V512; } let Predicates = [prd, HasVLX] in { @@ -10766,8 +10265,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), - (i32 FROUND_NO_EXC))>, + (i32 imm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10781,17 +10279,16 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), - (i32 FROUND_NO_EXC))>, + (i32 imm:$src3))>, EVEX_B, Sched<[sched]>; } multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ let Predicates = [prd] in { defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, - avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512>, + avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE, sched.ZMM, _.info512>, EVEX_V512; } @@ -10835,267 +10332,64 @@ multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, X86VectorVTInfo _, bits<8> opc, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> { + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd> { let Predicates = [prd] in { defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>, - avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, sched.XMM, _>; + avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeSAE, sched.XMM, _>; } } multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, bits<8> opcPs, bits<8> opcPd, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, - opcPs, OpNode, OpNodeRnd, sched, prd>, + opcPs, OpNode, OpNodeSAE, sched, prd>, EVEX_CD8<32, CD8VF>; defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, - opcPd, OpNode, OpNodeRnd, sched, prd>, + opcPd, OpNode, OpNodeSAE, sched, prd>, EVEX_CD8<64, CD8VF>, VEX_W; } defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, - X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>, + X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX; defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, - X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>, + X86VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, - X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>, + X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, - 0x50, X86VRange, X86VRangeRnd, + 0x50, X86VRange, X86VRangeSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, - 0x50, X86VRange, X86VRangeRnd, + 0x50, X86VRange, X86VRangeSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; defm VRANGESD: 
avx512_common_fp_sae_scalar_imm<"vrangesd", - f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>, + f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, - 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>, + 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, - 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>, + 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, - 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>, + 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, - 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>, + 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, - 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>, + 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; - -multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> { - // Register - def : Pat<(_.VT (ffloor _.RC:$src)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0x9))>; - def : Pat<(_.VT (fnearbyint _.RC:$src)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0xC))>; - def : Pat<(_.VT (fceil _.RC:$src)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0xA))>; - def : Pat<(_.VT (frint _.RC:$src)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0x4))>; - def : Pat<(_.VT (ftrunc _.RC:$src)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") - _.RC:$src, (i32 0xB))>; - - // Merge-masking - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") - _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>; - - // Zero-masking - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, 
(i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") - _.KRCWM:$mask, _.RC:$src, (i32 0xB))>; - - // Load - def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0x9))>; - def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0xC))>; - def : Pat<(_.VT (fceil (_.LdFrag addr:$src))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0xA))>; - def : Pat<(_.VT (frint (_.LdFrag addr:$src))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0x4))>; - def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") - addr:$src, (i32 0xB))>; - - // Merge-masking + load - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>; - - // Zero-masking + load - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") - _.KRCWM:$mask, addr:$src, (i32 0xB))>; - - // Broadcast load - def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0x9))>; - def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - 
(!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0xC))>; - def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0xA))>; - def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0x4))>; - def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") - addr:$src, (i32 0xB))>; - - // Merge-masking + broadcast load - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.RC:$dst)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") - _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>; - - // Zero-masking + broadcast load - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0x9))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0xC))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0xA))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0x4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))), - _.ImmAllZerosV)), - (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") - _.KRCWM:$mask, addr:$src, (i32 0xB))>; -} - -let Predicates = [HasAVX512] in { - defm : AVX512_rndscale_lowering<v16f32_info, "PS">; - defm : AVX512_rndscale_lowering<v8f64_info, "PD">; -} - -let Predicates = [HasVLX] in { - defm : AVX512_rndscale_lowering<v8f32x_info, "PS">; - defm : AVX512_rndscale_lowering<v4f64x_info, "PD">; - defm : AVX512_rndscale_lowering<v4f32x_info, "PS">; - defm : AVX512_rndscale_lowering<v2f64x_info, "PD">; -} - multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, @@ -11577,9 +10871,9 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (v2f64 
(COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), @@ -11587,21 +10881,21 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), - (bitconvert (v4i32 immAllZerosV))), + immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), - (bitconvert (v4i32 immAllZerosV))), + immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), - (bitconvert (v4i32 immAllZerosV))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), + immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -12100,39 +11394,39 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, // TODO: We should maybe have a more generalized algorithm for folding to // vpternlog. 
let Predicates = [HasAVX512] in { - def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v64i8 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; - def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v32i16 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; - def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v16i32 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; - def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))), + def : Pat<(xor VR512:$src, (v8i64 immAllOnesV)), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; } let Predicates = [HasAVX512, NoVLX] in { - def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), @@ -12140,28 +11434,28 @@ let Predicates = [HasAVX512, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (i8 15)), sub_ymm)>; - def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (i8 15)), sub_ymm)>; - def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (i8 15)), sub_ymm)>; - def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), @@ -12171,22 +11465,22 @@ let Predicates = [HasAVX512, NoVLX] in { } 
let Predicates = [HasVLX] in { - def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v16i8 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v8i16 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v4i32 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + def : Pat<(xor VR128X:$src, (v2i64 immAllOnesV)), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v32i8 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v16i16 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v8i32 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; - def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), + def : Pat<(xor VR256X:$src, (v4i64 immAllOnesV)), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; } @@ -12194,58 +11488,55 @@ let Predicates = [HasVLX] in { // AVX-512 - FixupImm //===----------------------------------------------------------------------===// -multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo TblVT>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (TblVT.VT _.RC:$src3), - (i32 imm:$src4), - (i32 FROUND_CURRENT))>, Sched<[sched]>; + (X86VFixupimm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT _.RC:$src3), + (i32 imm:$src4))>, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), - (i32 imm:$src4), - (i32 FROUND_CURRENT))>, + (X86VFixupimm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), + (i32 imm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr##", $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4), - (i32 FROUND_CURRENT))>, + (X86VFixupimm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), + (i32 imm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" } multiclass 
avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86FoldableSchedWrite sched, - X86VectorVTInfo _, X86VectorVTInfo TblVT>{ + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo TblVT> + : avx512_fixupimm_packed<opc, OpcodeStr, sched, _, TblVT> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", "$src2, $src3, {sae}, $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (TblVT.VT _.RC:$src3), - (i32 imm:$src4), - (i32 FROUND_NO_EXC))>, + (X86VFixupimmSAE (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT _.RC:$src3), + (i32 imm:$src4))>, EVEX_B, Sched<[sched]>; } } -multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo _src3VT> { let Constraints = "$src1 = $dst" , Predicates = [HasAVX512], @@ -12253,30 +11544,27 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4), - (i32 FROUND_CURRENT))>, Sched<[sched]>; + (X86VFixupimms (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT _src3VT.RC:$src3), + (i32 imm:$src4))>, Sched<[sched]>; defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", "$src2, $src3, {sae}, $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4), - (i32 FROUND_NO_EXC))>, + (X86VFixupimmSAEs (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT _src3VT.RC:$src3), + (i32 imm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (_src3VT.VT (scalar_to_vector - (_src3VT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4), - (i32 FROUND_CURRENT))>, + (X86VFixupimms (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT (scalar_to_vector + (_src3VT.ScalarLdFrag addr:$src3))), + (i32 imm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -12285,25 +11573,23 @@ multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _Vec, AVX512VLVectorVTInfo _Tbl> { let Predicates = [HasAVX512] in - defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM, - _Vec.info512, _Tbl.info512>, - avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM, + defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM, _Vec.info512, _Tbl.info512>, AVX512AIi8Base, EVEX_4V, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM, + defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM, _Vec.info128, _Tbl.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128; - defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM, + defm Z256 : 
avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM, _Vec.info256, _Tbl.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256; } } -defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, +defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", SchedWriteFAdd.Scl, f32x_info, v4i32x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; -defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, +defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", SchedWriteFAdd.Scl, f64x_info, v2i64x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info, @@ -12364,6 +11650,12 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo _.FRC:$src)))), (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst, (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>; + def : Pat<(MoveNode + (_.VT VR128X:$dst), + (_.VT (scalar_to_vector + (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), + (_.ScalarLdFrag addr:$src))))), + (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -12377,6 +11669,16 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src2)), + _.FRC:$src0))), + (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk) + (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), + VK1WM:$mask, _.VT:$src1, addr:$src2)>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), @@ -12388,6 +11690,13 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo (!cast<I>("V"#OpcPrefix#Zrr_Intkz) VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))), + (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>; } } @@ -12413,26 +11722,6 @@ multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>; defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>; -multiclass AVX512_scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, - SDNode Move, X86VectorVTInfo _, - bits<8> ImmV> { - let Predicates = [HasAVX512] in { - def : Pat<(_.VT (Move _.VT:$dst, - (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))), - (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src, - (i32 ImmV))>; - } -} - -defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESS", X86Movss, - v4f32x_info, 0x01>; -defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESS", X86Movss, - v4f32x_info, 0x02>; -defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESD", X86Movsd, - v2f64x_info, 0x01>; -defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESD", X86Movsd, - v2f64x_info, 0x02>; - //===----------------------------------------------------------------------===// // AES instructions 
//===----------------------------------------------------------------------===// @@ -12645,12 +11934,19 @@ defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU, defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>; defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>; +def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2), + (X86Vpshufbitqmb node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst), (ins VTI.RC:$src1, VTI.RC:$src2), "vpshufbitqmb", "$src2, $src1", "$src1, $src2", (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT VTI.RC:$src2)), + (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD, Sched<[sched]>; defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst), @@ -12658,6 +11954,8 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { "vpshufbitqmb", "$src2, $src1", "$src1, $src2", (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT (VTI.LdFrag addr:$src2))), + (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src2)))>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -12753,13 +12051,13 @@ defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fmaddss", "$src3, $src2", "$src2, $src3", - []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fnmaddss", "$src3, $src2", "$src2, $src3", - []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; } @@ -12782,3 +12080,196 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, Sched<[SchedWriteFMA.ZMM.Folded]>; } +let hasSideEffects = 0 in { + let mayStore = 1 in + def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>; + let mayLoad = 1 in + def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>; +} + +//===----------------------------------------------------------------------===// +// VP2INTERSECT +//===----------------------------------------------------------------------===// + +multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { + def rr : I<0x68, MRMSrcReg, + (outs _.KRPC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat("vp2intersect", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRPC:$dst, (X86vp2intersect + _.RC:$src1, (_.VT _.RC:$src2)))]>, + EVEX_4V, T8XD; + + def rm : I<0x68, MRMSrcMem, + (outs _.KRPC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), + !strconcat("vp2intersect", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRPC:$dst, (X86vp2intersect + _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : I<0x68, MRMSrcMem, + (outs _.KRPC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, + ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), + [(set _.KRPC:$dst, (X86vp2intersect + 
_.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_vp2intersect<AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512, HasVP2INTERSECT] in + defm Z : avx512_vp2intersect_modes<_.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in { + defm Z256 : avx512_vp2intersect_modes<_.info256>, EVEX_V256; + defm Z128 : avx512_vp2intersect_modes<_.info128>, EVEX_V128; + } +} + +defm VP2INTERSECTD : avx512_vp2intersect<avx512vl_i32_info>; +defm VP2INTERSECTQ : avx512_vp2intersect<avx512vl_i64_info>, VEX_W; + +multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _SrcVTInfo, + AVX512VLVectorVTInfo _DstVTInfo, + SDNode OpNode, Predicate prd, + bit IsCommutable = 0> { + let Predicates = [prd] in + defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode, + _SrcVTInfo.info512, _DstVTInfo.info512, + _SrcVTInfo.info512, IsCommutable>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + let Predicates = [HasVLX, prd] in { + defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode, + _SrcVTInfo.info256, _DstVTInfo.info256, + _SrcVTInfo.info256, IsCommutable>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode, + _SrcVTInfo.info128, _DstVTInfo.info128, + _SrcVTInfo.info128, IsCommutable>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + } +} + +defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", + SchedWriteCvtPD2PS, //FIXME: Shoulod be SchedWriteCvtPS2BF + avx512vl_f32_info, avx512vl_i16_info, + X86cvtne2ps2bf16, HasBF16, 0>, T8XD; + +// Truncate Float to BFloat16 +multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasBF16] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info, + X86cvtneps2bf16, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasBF16, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info, + null_frag, sched.XMM, "{1to4}", "{x}", f128mem, + VK4WM>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info, + X86cvtneps2bf16, + sched.YMM, "{1to8}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, + f128mem:$src), 0, "intel">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, + f256mem:$src), 0, "intel">; + } +} + +defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16", + SchedWriteCvtPD2PS>, T8XS, + EVEX_CD8<32, CD8VF>; + +let Predicates = [HasBF16, HasVLX] in { + // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))), + (VCVTNEPS2BF16Z128rr VR128X:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0), + VK4WM:$mask), + (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>; + + def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))), + (VCVTNEPS2BF16Z128rm addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0), + VK4WM:$mask), + (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 + (X86VBroadcast (loadf32 addr:$src))))), + (VCVTNEPS2BF16Z128rmb addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + (v8i16 VR128X:$src0), VK4WM:$mask), + (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + v8i16x_info.ImmAllZerosV, VK4WM:$mask), + (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; +} + +let Constraints = "$src1 = $dst" in { +multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo src_v> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + EVEX_4V; + + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, + (src_v.VT (bitconvert + (src_v.LdFrag addr:$src3)))))>, EVEX_4V; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, + !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr), + (_.VT (OpNode _.RC:$src1, _.RC:$src2, + (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>, + EVEX_B, EVEX_4V; + +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _, + AVX512VLVectorVTInfo src_v, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info512, + src_v.info512>, EVEX_V512; + } + let Predicates = [HasVLX, prd] in { + defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info256, + src_v.info256>, EVEX_V256; + defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info128, + src_v.info128>, EVEX_V128; + } +} + +defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, + avx512vl_f32_info, avx512vl_i32_info, + HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index cb5a4e5b5d41..e52635f8d48b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1,9 +1,8 @@ //===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -195,19 +194,22 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), // Surprisingly enough, these are not two address instructions! let Defs = [EFLAGS] in { +// NOTE: These are order specific, we want the ri8 forms to be listed +// first so that they are slightly preferred to the ri forms. + // Register-Integer Signed Integer Multiply -def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 - (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), - "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, imm:$src2))]>, - Sched<[WriteIMul16Imm]>, OpSize16; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR16:$dst, EFLAGS, (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, Sched<[WriteIMul16Imm]>, OpSize16; +def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, + Sched<[WriteIMul16Imm]>, OpSize16; def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -220,26 +222,20 @@ def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 [(set GR32:$dst, EFLAGS, (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>, Sched<[WriteIMul32Imm]>, OpSize32; -def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 - (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), - "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, EFLAGS, - (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>, - Sched<[WriteIMul64Imm]>; def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR64:$dst, EFLAGS, (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>, Sched<[WriteIMul64Imm]>; +def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>, + Sched<[WriteIMul64Imm]>; // Memory-Integer Signed Integer Multiply -def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 - (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), - "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, EFLAGS, - (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>, - Sched<[WriteIMul16Imm.Folded]>, OpSize16; def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -247,12 +243,12 @@ def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2))]>, Sched<[WriteIMul16Imm.Folded]>, OpSize16; -def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 - (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), - "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, EFLAGS, - (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>, - 
Sched<[WriteIMul32Imm.Folded]>, OpSize32; +def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 + (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>, + Sched<[WriteIMul16Imm.Folded]>, OpSize16; def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -260,13 +256,12 @@ def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2))]>, Sched<[WriteIMul32Imm.Folded]>, OpSize32; -def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 - (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), - "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, EFLAGS, - (X86smul_flag (loadi64 addr:$src1), - i64immSExt32:$src2))]>, - Sched<[WriteIMul64Imm.Folded]>; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>, + Sched<[WriteIMul32Imm.Folded]>, OpSize32; def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -274,6 +269,13 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2))]>, Sched<[WriteIMul64Imm.Folded]>; +def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (loadi64 addr:$src1), + i64immSExt32:$src2))]>, + Sched<[WriteIMul64Imm.Folded]>; } // Defs = [EFLAGS] // unsigned division/remainder @@ -436,11 +438,10 @@ def X86sub_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs), // TODO: inc/dec is slow for P4, but fast for Pentium-M. let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { -let CodeSize = 2 in +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "inc{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86add_flag_nocf GR8:$src1, 1))]>; -let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86add_flag_nocf GR16:$src1, 1))]>, @@ -484,11 +485,10 @@ let Predicates = [UseIncDec, In64BitMode] in { } // CodeSize = 2, SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { -let CodeSize = 2 in +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "dec{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86sub_flag_nocf GR8:$src1, 1))]>; -let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. 
def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86sub_flag_nocf GR16:$src1, 1))]>, @@ -605,16 +605,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, - Imm8, i8imm, imm8_su, i8imm, invalid_node, + Imm8, i8imm, relocImm8_su, i8imm, invalid_node, 0, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, - Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, + Imm16, i16imm, relocImm16_su, i16i8imm, i16immSExt8_su, 1, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, - Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, + Imm32, i32imm, relocImm32_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, - Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su, + Imm32S, i64i32imm, i64relocImmSExt32_su, i64i8imm, i64immSExt8_su, 1, OpSizeFixed, 1>; /// ITy - This instruction base class takes the type info for the instruction. @@ -924,11 +924,12 @@ class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, string mnemonic, Format RegMRM, Format MemMRM, SDNode opnodeflag, SDNode opnode, - bit CommutableRR, bit ConvertibleToThreeAddress> { + bit CommutableRR, bit ConvertibleToThreeAddress, + bit ConvertibleToThreeAddressRR> { let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR in { - let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; @@ -1169,16 +1170,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m, - X86and_flag, and, 1, 0>; + X86and_flag, and, 1, 0, 0>; defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m, - X86or_flag, or, 1, 0>; + X86or_flag, or, 1, 0, 0>; defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, - X86xor_flag, xor, 1, 0>; + X86xor_flag, xor, 1, 0, 0>; defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, - X86add_flag, add, 1, 1>; + X86add_flag, add, 1, 1, 1>; let isCompare = 1 in { defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, - X86sub_flag, sub, 0, 0>; + X86sub_flag, sub, 0, 1, 0>; } // Arithmetic. diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index dcce7b9951f2..50aed98112c3 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -1,9 +1,8 @@ //===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td index f5494fc0b13f..099f6aa8d8bb 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -1,9 +1,8 @@ //===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,99 +13,94 @@ // CMOV instructions. -multiclass CMOV<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched, - PatLeaf CondNode> { - let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", - isCommutable = 1, SchedRW = [Sched] in { - def NAME#16rr - : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), - [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>, - TB, OpSize16; - def NAME#32rr - : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), - [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>, - TB, OpSize32; - def NAME#64rr - :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), - [(set GR64:$dst, - (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB; - } - - let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", - SchedRW = [Sched.Folded, Sched.ReadAfterFold] in { - def NAME#16rm - : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), - !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), - [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), - CondNode, EFLAGS))]>, TB, OpSize16; - def NAME#32rm - : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), - !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), - [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), - CondNode, EFLAGS))]>, TB, OpSize32; - def NAME#64rm - :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), - !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), - [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), - CondNode, EFLAGS))]>, TB; - } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" -} // end multiclass +let isCodeGenOnly = 1, ForceDisassemble = 1 in { +let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1, SchedRW = [WriteCMOV] in { + def CMOV16rr + : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), + "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>, + TB, OpSize16; + def CMOV32rr + : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond), + "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>, + TB, OpSize32; + def CMOV64rr + 
:RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond), + "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB; +} +let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in { + def CMOV16rm + : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), + "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + imm:$cond, EFLAGS))]>, TB, OpSize16; + def CMOV32rm + : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond), + "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + imm:$cond, EFLAGS))]>, TB, OpSize32; + def CMOV64rm + :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond), + "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + imm:$cond, EFLAGS))]>, TB; +} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // isCodeGenOnly = 1, ForceDisassemble = 1 -// Conditional Moves. -defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>; -defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>; -defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>; -defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>; -defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>; -defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>; -defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>; -defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>; -defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>; -defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>; -defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>; -defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>; -defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>; -defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>; -defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>; -defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>; +// SetCC instructions. +let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in { + def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond), + "set${cond}\t$dst", + [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>, + TB, Sched<[WriteSETCC]>; + def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond), + "set${cond}\t$dst", + [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>, + TB, Sched<[WriteSETCCStore]>; +} // Uses = [EFLAGS] +multiclass CMOV_SETCC_Aliases<string Cond, int CC> { + def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}", + (CMOV16rr GR16:$dst, GR16:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{w}\t{$src, $dst|$dst, $src}", + (CMOV16rm GR16:$dst, i16mem:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}", + (CMOV32rr GR32:$dst, GR32:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{l}\t{$src, $dst|$dst, $src}", + (CMOV32rm GR32:$dst, i32mem:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}", + (CMOV64rr GR64:$dst, GR64:$src, CC), 0>; + def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}", + (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>; -// SetCC instructions. 
-multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { - let Uses = [EFLAGS] in { - def r : I<opc, MRMXr, (outs GR8:$dst), (ins), - !strconcat(Mnemonic, "\t$dst"), - [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>, - TB, Sched<[WriteSETCC]>; - def m : I<opc, MRMXm, (outs), (ins i8mem:$dst), - !strconcat(Mnemonic, "\t$dst"), - [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>, - TB, Sched<[WriteSETCCStore]>; - } // Uses = [EFLAGS] + def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>; + def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>; } -defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set -defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set -defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than -defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal -defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to -defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to -defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal -defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than -defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set -defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed -defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set -defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set -defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than -defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal -defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal -defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than +defm : CMOV_SETCC_Aliases<"o" , 0>; +defm : CMOV_SETCC_Aliases<"no", 1>; +defm : CMOV_SETCC_Aliases<"b" , 2>; +defm : CMOV_SETCC_Aliases<"ae", 3>; +defm : CMOV_SETCC_Aliases<"e" , 4>; +defm : CMOV_SETCC_Aliases<"ne", 5>; +defm : CMOV_SETCC_Aliases<"be", 6>; +defm : CMOV_SETCC_Aliases<"a" , 7>; +defm : CMOV_SETCC_Aliases<"s" , 8>; +defm : CMOV_SETCC_Aliases<"ns", 9>; +defm : CMOV_SETCC_Aliases<"p" , 10>; +defm : CMOV_SETCC_Aliases<"np", 11>; +defm : CMOV_SETCC_Aliases<"l" , 12>; +defm : CMOV_SETCC_Aliases<"ge", 13>; +defm : CMOV_SETCC_Aliases<"le", 14>; +defm : CMOV_SETCC_Aliases<"g" , 15>; // SALC is an undocumented instruction. Information for this instruction can be found // here http://www.rcollins.org/secrets/opcodes/SALC.html diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 394dca8e7817..efaccdc9ee96 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1,9 +1,8 @@ //===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,11 +19,6 @@ def GetLo32XForm : SDNodeXForm<imm, [{ return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N)); }]>; -def GetLo8XForm : SDNodeXForm<imm, [{ - // Transformation function: get the low 8 bits. 
- return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); -}]>; - //===----------------------------------------------------------------------===// // Random Pseudo Instructions. @@ -360,7 +354,7 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), // this happens, it is great. However, if we are left with an 8-bit sbb and an // and, we might as well just match it as a setb. def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), - (SETBr)>; + (SETCCr (i8 2))>; // Patterns to give priority when both inputs are zero so that we don't use // an immediate for the RHS. @@ -574,8 +568,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; - defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; - defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + let Predicates = [NoAVX512] in { + defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + } + let Predicates = [HasAVX512] in { + defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>; + defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>; + } let Predicates = [NoVLX] in { defm _VR128 : CMOVrr_PSEUDO<VR128, v2i64>; defm _VR256 : CMOVrr_PSEUDO<VR256, v4i64>; @@ -712,6 +712,32 @@ def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, "{$src2, $dst|$dst, $src2}"), [(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK; +// NOTE: These are order specific, we want the mi8 forms to be listed +// first so that they are slightly preferred to the mi forms. +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>, + OpSize16, LOCK; + +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>, + OpSize32, LOCK; + +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>, + LOCK; + def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), @@ -742,30 +768,6 @@ def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, "{$src2, $dst|$dst, $src2}"), [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>, LOCK; - -def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), - !strconcat(mnemonic, "{w}\t", - "{$src2, $dst|$dst, $src2}"), - [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>, - OpSize16, LOCK; - -def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), - !strconcat(mnemonic, "{l}\t", - "{$src2, $dst|$dst, $src2}"), - [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>, - OpSize32, LOCK; - -def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, - ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, - ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), - !strconcat(mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), 
- [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>, - LOCK; } } @@ -868,7 +870,7 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - SchedRW = [WriteCMPXCHGRMW] in { + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in { defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; } @@ -892,8 +894,9 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; // the instruction and we are sure we will have a valid register to restore // the value of RBX. let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX], - SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, - Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in { + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst", + usesCustomInserter = 1 in { def LCMPXCHG8B_SAVE_EBX : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save), @@ -904,14 +907,14 @@ def LCMPXCHG8B_SAVE_EBX : let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW] in { + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in { defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", X86cas16, i128mem>, REX_W; } // Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant. let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b], SchedRW = [WriteCMPXCHGRMW], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", usesCustomInserter = 1 in { def LCMPXCHG16B_SAVE_RBX : @@ -1001,28 +1004,31 @@ defm : RELEASE_BINOP_MI<"OR", or>; defm : RELEASE_BINOP_MI<"XOR", xor>; defm : RELEASE_BINOP_MI<"SUB", sub>; -// Same as above, but for floating-point. -// FIXME: imm version. -// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// Atomic load + floating point patterns. // FIXME: This could also handle SIMD operations with *ps and *pd instructions. 
-let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in { -multiclass RELEASE_FP_BINOP_MI<SDNode op> { - def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), - "#BINOP "#NAME#"32mr PSEUDO!", - [(atomic_store_32 addr:$dst, - (i32 (bitconvert (op - (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), - FR32:$src))))]>, Requires<[HasSSE1]>; - def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), - "#BINOP "#NAME#"64mr PSEUDO!", - [(atomic_store_64 addr:$dst, - (i64 (bitconvert (op - (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), - FR64:$src))))]>, Requires<[HasSSE2]>; +multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> { + def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))), + (!cast<Instruction>(Name#"SSrm") FR32:$src1, addr:$src2)>, + Requires<[UseSSE1]>; + def : Pat<(op FR32:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))), + (!cast<Instruction>("V"#Name#"SSrm") FR32:$src1, addr:$src2)>, + Requires<[UseAVX]>; + def : Pat<(op FR32X:$src1, (bitconvert (i32 (atomic_load_32 addr:$src2)))), + (!cast<Instruction>("V"#Name#"SSZrm") FR32X:$src1, addr:$src2)>, + Requires<[HasAVX512]>; + + def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))), + (!cast<Instruction>(Name#"SDrm") FR64:$src1, addr:$src2)>, + Requires<[UseSSE1]>; + def : Pat<(op FR64:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))), + (!cast<Instruction>("V"#Name#"SDrm") FR64:$src1, addr:$src2)>, + Requires<[UseAVX]>; + def : Pat<(op FR64X:$src1, (bitconvert (i64 (atomic_load_64 addr:$src2)))), + (!cast<Instruction>("V"#Name#"SDZrm") FR64X:$src1, addr:$src2)>, + Requires<[HasAVX512]>; } -defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>; // FIXME: Add fsub, fmul, fdiv, ... -} multiclass RELEASE_UNOP<string Name, dag dag8, dag dag16, dag dag32, dag dag64> { @@ -1083,6 +1089,35 @@ def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>; def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>; def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>; +// Floating point loads/stores. 
+def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))), + (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; +def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))), + (VMOVSSmr addr:$dst, FR32:$src)>, Requires<[UseAVX]>; +def : Pat<(atomic_store_32 addr:$dst, (i32 (bitconvert (f32 FR32:$src)))), + (VMOVSSZmr addr:$dst, FR32:$src)>, Requires<[HasAVX512]>; + +def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))), + (MOVSDmr addr:$dst, FR64:$src)>, Requires<[UseSSE2]>; +def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))), + (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[UseAVX]>; +def : Pat<(atomic_store_64 addr:$dst, (i64 (bitconvert (f64 FR64:$src)))), + (VMOVSDmr addr:$dst, FR64:$src)>, Requires<[HasAVX512]>; + +def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))), + (MOVSSrm_alt addr:$src)>, Requires<[UseSSE1]>; +def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))), + (VMOVSSrm_alt addr:$src)>, Requires<[UseAVX]>; +def : Pat<(f32 (bitconvert (i32 (atomic_load_32 addr:$src)))), + (VMOVSSZrm_alt addr:$src)>, Requires<[HasAVX512]>; + +def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), + (MOVSDrm_alt addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), + (VMOVSDrm_alt addr:$src)>, Requires<[UseAVX]>; +def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), + (VMOVSDZrm_alt addr:$src)>, Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // DAG Pattern Matching Rules //===----------------------------------------------------------------------===// @@ -1241,37 +1276,23 @@ def : Pat<(X86cmp GR32:$src1, 0), def : Pat<(X86cmp GR64:$src1, 0), (TEST64rr GR64:$src1, GR64:$src1)>; +def inv_cond_XFORM : SDNodeXForm<imm, [{ + X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue()); + return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), + SDLoc(N), MVT::i8); +}]>; + // Conditional moves with folded loads with operands swapped and conditions // inverted. 
-multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, - Instruction Inst64> { - let Predicates = [HasCMov] in { - def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), - (Inst16 GR16:$src2, addr:$src1)>; - def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), - (Inst32 GR32:$src2, addr:$src1)>; - def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), - (Inst64 GR64:$src2, addr:$src1)>; - } +let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS), + (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS), + (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS), + (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; } -defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; -defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; -defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; -defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; -defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; -defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; -defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; -defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; -defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; -defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; -defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; -defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; -defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; -defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; -defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; -defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; - // zextload bool -> zextload byte // i1 stored in one byte in zero-extended form. // Upper bits cleanup should be executed before Store. @@ -1298,14 +1319,16 @@ def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; // For other extloads, use subregs, since the high contents of the register are // defined after an extload. +// NOTE: The extloadi64i32 pattern needs to be first as it will try to form +// 32-bit loads for 4 byte aligned i8/i16 loads. +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; def : Pat<(extloadi64i1 addr:$src), (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; def : Pat<(extloadi64i8 addr:$src), (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; def : Pat<(extloadi64i16 addr:$src), (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; -def : Pat<(extloadi64i32 addr:$src), - (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; // anyext. Define these to do an explicit zero-extend to // avoid partial-register updates. @@ -1351,6 +1374,8 @@ def def32 : PatLeaf<(i32 GR32:$src), [{ // we can use a SUBREG_TO_REG. 
def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; +def : Pat<(i64 (and (anyext def32:$src), 0x00000000FFFFFFFF)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; //===----------------------------------------------------------------------===// // Pattern match OR as ADD @@ -1377,9 +1402,12 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ // Try this before the selecting to OR. let SchedRW = [WriteALU] in { -let isConvertibleToThreeAddress = 1, +let isConvertibleToThreeAddress = 1, isPseudo = 1, Constraints = "$src1 = $dst", Defs = [EFLAGS] in { let isCommutable = 1 in { +def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), + "", // orb/addb REG, REG + [(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>; def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "", // orw/addw REG, REG [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; @@ -1394,6 +1422,10 @@ def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. +def ADD8ri_DB : I<0, Pseudo, + (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "", // orb/addb REG, imm8 + [(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>; def ADD16ri8_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "", // orw/addw REG, imm8 @@ -1483,6 +1515,13 @@ def : Pat<(add GR64:$src1, 128), def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), (SUB64mi8 addr:$dst, -128)>; +def : Pat<(X86add_flag_nocf GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(X86add_flag_nocf GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(X86add_flag_nocf GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; + // The same trick applies for 32-bit immediate fields in 64-bit // instructions. def : Pat<(add GR64:$src1, 0x0000000080000000), @@ -1490,6 +1529,9 @@ def : Pat<(add GR64:$src1, 0x0000000080000000), def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst), (SUB64mi32 addr:$dst, 0xffffffff80000000)>; +def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; + // To avoid needing to materialize an immediate in a register, use a 32-bit and // with implicit zero-extension instead of a 64-bit and if the immediate has at // least 32 bits of leading zeros. If in addition the last 32 bits can be @@ -1504,7 +1546,7 @@ def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), (i64 0), (AND32ri8 (EXTRACT_SUBREG GR64:$src, sub_32bit), - (i32 (GetLo8XForm imm:$imm))), + (i32 (GetLo32XForm imm:$imm))), sub_32bit)>; def : Pat<(and GR64:$src, i64immZExt32:$imm), @@ -1714,40 +1756,43 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; -// Helper imms to check if a mask doesn't change significant shift/rotate bits. 
-def immShift8 : ImmLeaf<i8, [{ - return countTrailingOnes<uint64_t>(Imm) >= 3; +def shiftMask8 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 3); }]>; -def immShift16 : ImmLeaf<i8, [{ - return countTrailingOnes<uint64_t>(Imm) >= 4; + +def shiftMask16 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 4); }]>; -def immShift32 : ImmLeaf<i8, [{ - return countTrailingOnes<uint64_t>(Imm) >= 5; + +def shiftMask32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 5); }]>; -def immShift64 : ImmLeaf<i8, [{ - return countTrailingOnes<uint64_t>(Imm) >= 6; + +def shiftMask64 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{ + return isUnneededShiftMask(N, 6); }]>; + // Shift amount is implicitly masked. multiclass MaskedShiftAmountPats<SDNode frag, string name> { // (shift x (and y, 31)) ==> (shift x, y) - def : Pat<(frag GR8:$src1, (and CL, immShift32)), + def : Pat<(frag GR8:$src1, (shiftMask32 CL)), (!cast<Instruction>(name # "8rCL") GR8:$src1)>; - def : Pat<(frag GR16:$src1, (and CL, immShift32)), + def : Pat<(frag GR16:$src1, (shiftMask32 CL)), (!cast<Instruction>(name # "16rCL") GR16:$src1)>; - def : Pat<(frag GR32:$src1, (and CL, immShift32)), + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), (!cast<Instruction>(name # "32rCL") GR32:$src1)>; - def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast<Instruction>(name # "8mCL") addr:$dst)>; - def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast<Instruction>(name # "16mCL") addr:$dst)>; - def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast<Instruction>(name # "32mCL") addr:$dst)>; // (shift x (and y, 63)) ==> (shift x, y) - def : Pat<(frag GR64:$src1, (and CL, immShift64)), + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), (!cast<Instruction>(name # "64rCL") GR64:$src1)>; - def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), (!cast<Instruction>(name # "64mCL") addr:$dst)>; } @@ -1763,23 +1808,23 @@ defm : MaskedShiftAmountPats<sra, "SAR">; // not tracking flags for these nodes. 
multiclass MaskedRotateAmountPats<SDNode frag, string name> { // (rot x (and y, BitWidth - 1)) ==> (rot x, y) - def : Pat<(frag GR8:$src1, (and CL, immShift8)), + def : Pat<(frag GR8:$src1, (shiftMask8 CL)), (!cast<Instruction>(name # "8rCL") GR8:$src1)>; - def : Pat<(frag GR16:$src1, (and CL, immShift16)), + def : Pat<(frag GR16:$src1, (shiftMask16 CL)), (!cast<Instruction>(name # "16rCL") GR16:$src1)>; - def : Pat<(frag GR32:$src1, (and CL, immShift32)), + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), (!cast<Instruction>(name # "32rCL") GR32:$src1)>; - def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst), + def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst), (!cast<Instruction>(name # "8mCL") addr:$dst)>; - def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst), + def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst), (!cast<Instruction>(name # "16mCL") addr:$dst)>; - def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast<Instruction>(name # "32mCL") addr:$dst)>; // (rot x (and y, 63)) ==> (rot x, y) - def : Pat<(frag GR64:$src1, (and CL, immShift64)), + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), (!cast<Instruction>(name # "64rCL") GR64:$src1)>; - def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), (!cast<Instruction>(name # "64mCL") addr:$dst)>; } @@ -1790,13 +1835,13 @@ defm : MaskedRotateAmountPats<rotr, "ROR">; // Double shift amount is implicitly masked. multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> { // (shift x (and y, 31)) ==> (shift x, y) - def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)), + def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)), (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>; - def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)), + def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)), (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>; // (shift x (and y, 63)) ==> (shift x, y) - def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)), + def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)), (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>; } @@ -1805,57 +1850,57 @@ defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">; let Predicates = [HasBMI2] in { let AddedComplexity = 1 in { - def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)), + def : Pat<(sra GR32:$src1, (shiftMask32 GR8:$src2)), (SARX32rr GR32:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)), + def : Pat<(sra GR64:$src1, (shiftMask64 GR8:$src2)), (SARX64rr GR64:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)), + def : Pat<(srl GR32:$src1, (shiftMask32 GR8:$src2)), (SHRX32rr GR32:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)), + def : Pat<(srl GR64:$src1, (shiftMask64 GR8:$src2)), (SHRX64rr GR64:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)), + def : Pat<(shl GR32:$src1, (shiftMask32 GR8:$src2)), (SHLX32rr GR32:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl 
GR64:$src1, (and GR8:$src2, immShift64)), + def : Pat<(shl GR64:$src1, (shiftMask64 GR8:$src2)), (SHLX64rr GR64:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } - def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + def : Pat<(sra (loadi32 addr:$src1), (shiftMask32 GR8:$src2)), (SARX32rm addr:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + def : Pat<(sra (loadi64 addr:$src1), (shiftMask64 GR8:$src2)), (SARX64rm addr:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + def : Pat<(srl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)), (SHRX32rm addr:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + def : Pat<(srl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)), (SHRX64rm addr:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + def : Pat<(shl (loadi32 addr:$src1), (shiftMask32 GR8:$src2)), (SHLX32rm addr:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + def : Pat<(shl (loadi64 addr:$src1), (shiftMask64 GR8:$src2)), (SHLX64rm addr:$src1, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; @@ -1864,7 +1909,7 @@ let Predicates = [HasBMI2] in { // Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location. multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR, Instruction BTS, Instruction BTC, - ImmLeaf ImmShift> { + PatFrag ShiftMask> { def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)), (BTR RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; @@ -1876,20 +1921,20 @@ multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; // Similar to above, but removing unneeded masking of the shift amount. 
- def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))), + def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))), (BTR RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))), + def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))), (BTS RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))), + def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))), (BTC RC:$src1, (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } -defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, immShift16>; -defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, immShift32>; -defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, immShift64>; +defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>; +defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>; +defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>; // (anyext (setcc_carry)) -> (setcc_carry) @@ -1974,8 +2019,6 @@ def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; // sub reg, relocImm def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2), (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>; -def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2), - (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>; // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index a7c7aaab2285..f82e80965b7c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -1,9 +1,8 @@ //===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -71,35 +70,40 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { } // Conditional Branches. 
-let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { - multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { - def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, - [(X86brcond bb:$dst, Cond, EFLAGS)]>; - let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { - def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm, - []>, OpSize16, TB; - def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm, - []>, TB, OpSize32; - } +let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump], + isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs), + (ins brtarget8:$dst, ccode:$cond), + "j${cond}\t$dst", + [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>; + let hasSideEffects = 0 in { + def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs), + (ins brtarget16:$dst, ccode:$cond), + "j${cond}\t$dst", + []>, OpSize16, TB; + def JCC_4 : Ii32PCRel<0x80, AddCCFrm, (outs), + (ins brtarget32:$dst, ccode:$cond), + "j${cond}\t$dst", + []>, TB, OpSize32; } } -defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; -defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; -defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; -defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; -defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; -defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>; -defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>; -defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>; -defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>; -defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>; -defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>; -defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>; -defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>; -defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>; -defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; -defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; +def : InstAlias<"jo\t$dst", (JCC_1 brtarget8:$dst, 0), 0>; +def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst, 1), 0>; +def : InstAlias<"jb\t$dst", (JCC_1 brtarget8:$dst, 2), 0>; +def : InstAlias<"jae\t$dst", (JCC_1 brtarget8:$dst, 3), 0>; +def : InstAlias<"je\t$dst", (JCC_1 brtarget8:$dst, 4), 0>; +def : InstAlias<"jne\t$dst", (JCC_1 brtarget8:$dst, 5), 0>; +def : InstAlias<"jbe\t$dst", (JCC_1 brtarget8:$dst, 6), 0>; +def : InstAlias<"ja\t$dst", (JCC_1 brtarget8:$dst, 7), 0>; +def : InstAlias<"js\t$dst", (JCC_1 brtarget8:$dst, 8), 0>; +def : InstAlias<"jns\t$dst", (JCC_1 brtarget8:$dst, 9), 0>; +def : InstAlias<"jp\t$dst", (JCC_1 brtarget8:$dst, 10), 0>; +def : InstAlias<"jnp\t$dst", (JCC_1 brtarget8:$dst, 11), 0>; +def : InstAlias<"jl\t$dst", (JCC_1 brtarget8:$dst, 12), 0>; +def : InstAlias<"jge\t$dst", (JCC_1 brtarget8:$dst, 13), 0>; +def : InstAlias<"jle\t$dst", (JCC_1 brtarget8:$dst, 14), 0>; +def : InstAlias<"jg\t$dst", (JCC_1 brtarget8:$dst, 15), 0>; // jcx/jecx/jrcx instructions. let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in { diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index c24d6d5b8df1..06e605fe5db2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -1,9 +1,8 @@ //===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,11 +28,11 @@ let hasSideEffects = 0 in { let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", []>, Sched<[WriteALU]>; + "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) def CQO : RI<0x99, RawFrm, (outs), (ins), - "{cqto|cqo}", []>, Sched<[WriteALU]>; + "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; } // Sign/Zero extenders diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index 1a8e529431af..0cca71bdc431 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -1,9 +1,8 @@ //===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -237,7 +236,8 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } -let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, + hasSideEffects = 0 in multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr, string PackTy, string Suff, SDNode OpNode, RegisterClass RC, @@ -263,8 +263,7 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, // the lowest element of the FMA*_Int instruction. Even though such analysis // may be not implemented yet we allow the routines doing the actual commute // transformation to decide if one or another instruction is commutable or not. -let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, - hasSideEffects = 0 in +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memopr, RegisterClass RC, X86FoldableSchedWrite sched> { diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp index def732a2dd00..25bbdddb7a21 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -1,9 +1,8 @@ //===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -57,7 +56,7 @@ using namespace llvm; #define FMA3GROUP_SCALAR(Name, Attrs) \ FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \ - FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) \ + FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs) #define FMA3GROUP_FULL(Name, Attrs) \ FMA3GROUP_PACKED(Name, Attrs) \ @@ -159,11 +158,9 @@ const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) { // FMA 231 instructions have an opcode of 0xB6-0xBF unsigned FormIndex = ((BaseOpcode - 0x90) >> 4) & 0x3; - auto I = std::lower_bound(Table.begin(), Table.end(), Opcode, - [FormIndex](const X86InstrFMA3Group &Group, - unsigned Opcode) { - return Group.Opcodes[FormIndex] < Opcode; - }); + auto I = partition_point(Table, [=](const X86InstrFMA3Group &Group) { + return Group.Opcodes[FormIndex] < Opcode; + }); assert(I != Table.end() && I->Opcodes[FormIndex] == Opcode && "Couldn't find FMA3 opcode!"); return I; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h index 6eec1db98bf8..7fa6f5917862 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h @@ -1,9 +1,8 @@ //===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 8e12efff77ea..2ec6d50f9702 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -1,9 +1,8 @@ //===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,18 +16,13 @@ // FPStack specific DAG Nodes. 
//===----------------------------------------------------------------------===// -def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, - SDTCisVT<1, f80>]>; -def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, - SDTCisPtrTy<1>, - SDTCisVT<2, OtherVT>]>; -def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, - SDTCisPtrTy<1>, - SDTCisVT<2, OtherVT>]>; -def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, - SDTCisVT<2, OtherVT>]>; +def SDTX86Fld : SDTypeProfile<1, 1, [SDTCisFP<0>, + SDTCisPtrTy<1>]>; +def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>]>; +def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>; +def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; -def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -42,17 +36,71 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; +def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; -def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, [SDNPHasChain, SDNPMayStore, SDNPSideEffect, SDNPMemOperand]>; +def X86fstf32 : PatFrag<(ops node:$val, node:$ptr), + (X86fst node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32; +}]>; +def X86fstf64 : PatFrag<(ops node:$val, node:$ptr), + (X86fst node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64; +}]>; +def X86fstf80 : PatFrag<(ops node:$val, node:$ptr), + (X86fst node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80; +}]>; + +def X86fldf32 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f32; +}]>; +def X86fldf64 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f64; +}]>; +def X86fldf80 : PatFrag<(ops node:$ptr), (X86fld node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::f80; +}]>; + +def X86fild16 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; +def X86fild32 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; +def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + +def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + +def X86fist64 : PatFrag<(ops node:$val, node:$ptr), + (X86fist node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + +def X86fp_to_i16mem 
: PatFrag<(ops node:$val, node:$ptr), + (X86fp_to_mem node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; +def X86fp_to_i32mem : PatFrag<(ops node:$val, node:$ptr), + (X86fp_to_mem node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; +def X86fp_to_i64mem : PatFrag<(ops node:$val, node:$ptr), + (X86fp_to_mem node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + //===----------------------------------------------------------------------===// // FPStack pattern fragments //===----------------------------------------------------------------------===// @@ -74,7 +122,9 @@ def fpimmneg1 : FPImmLeaf<fAny, [{ }]>; // Some 'special' instructions - expanded after instruction selection. -let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { +// Clobbers EFLAGS due to OR instruction used internally. +// FIXME: Can we model this in SelectionDAG? +let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in { def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), @@ -139,7 +189,6 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // These instructions cannot address 80-bit memory. multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, bit Forward = 1> { -let mayLoad = 1, hasSideEffects = 1 in { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, @@ -176,8 +225,10 @@ def _Fp80m64: FpI_<(outs RFP80:$dst), (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), (set RFP80:$dst, (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] @@ -185,49 +236,50 @@ def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, - (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (OpNode RFP32:$src1, (X86fild16 addr:$src2))), (set RFP32:$dst, - (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; + (OpNode (X86fild16 addr:$src2), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, - (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (OpNode RFP32:$src1, (X86fild32 addr:$src2))), (set RFP32:$dst, - (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; + (OpNode (X86fild32 addr:$src2), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, - (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (OpNode RFP64:$src1, (X86fild16 addr:$src2))), (set RFP64:$dst, - (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; + (OpNode (X86fild16 addr:$src2), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, - (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (OpNode RFP64:$src1, (X86fild32 addr:$src2))), (set RFP64:$dst, - (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; + (OpNode (X86fild32 addr:$src2), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), 
OneArgFPRW, [!if(Forward, (set RFP80:$dst, - (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (OpNode RFP80:$src1, (X86fild16 addr:$src2))), (set RFP80:$dst, - (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; + (OpNode (X86fild16 addr:$src2), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP80:$dst, - (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (OpNode RFP80:$src1, (X86fild32 addr:$src2))), (set RFP80:$dst, - (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; + (OpNode (X86fild32 addr:$src2), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), !strconcat("fi", asmstring, "{l}\t$src")>; -} // mayLoad = 1, hasSideEffects = 1 } let Defs = [FPSW], Uses = [FPCW] in { @@ -418,12 +470,11 @@ def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op), } // SchedRW // Floating point loads & stores. -let SchedRW = [WriteLoad] in { +let SchedRW = [WriteLoad], Uses = [FPCW] in { let canFoldAsLoad = 1 in { def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP32:$dst, (loadf32 addr:$src))]>; -let isReMaterializable = 1 in - def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, +def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, [(set RFP64:$dst, (loadf64 addr:$src))]>; def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, [(set RFP80:$dst, (loadf80 addr:$src))]>; @@ -435,23 +486,23 @@ def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, - [(set RFP32:$dst, (X86fild addr:$src, i16))]>; + [(set RFP32:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, - [(set RFP32:$dst, (X86fild addr:$src, i32))]>; + [(set RFP32:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, - [(set RFP32:$dst, (X86fild addr:$src, i64))]>; + [(set RFP32:$dst, (X86fild64 addr:$src))]>; def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, - [(set RFP64:$dst, (X86fild addr:$src, i16))]>; + [(set RFP64:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, - [(set RFP64:$dst, (X86fild addr:$src, i32))]>; + [(set RFP64:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, - [(set RFP64:$dst, (X86fild addr:$src, i64))]>; + [(set RFP64:$dst, (X86fild64 addr:$src))]>; def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, - [(set RFP80:$dst, (X86fild addr:$src, i16))]>; + [(set RFP80:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, - [(set RFP80:$dst, (X86fild addr:$src, i32))]>; + [(set RFP80:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, - [(set RFP80:$dst, (X86fild addr:$src, i64))]>; + [(set RFP80:$dst, (X86fild64 addr:$src))]>; } // SchedRW let SchedRW = [WriteStore], Uses = [FPCW] in { @@ -491,7 +542,7 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } // mayStore } // SchedRW, Uses = [FPCW] -let mayLoad = 1, SchedRW = [WriteLoad] 
in { +let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; @@ -541,7 +592,7 @@ def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst"> } // FP Stack manipulation instructions. -let SchedRW = [WriteMove] in { +let SchedRW = [WriteMove], Uses = [FPCW] in { def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">; def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">; def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">; @@ -549,7 +600,7 @@ def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">; } // Floating point constant loads. -let isReMaterializable = 1, SchedRW = [WriteZero] in { +let SchedRW = [WriteZero], Uses = [FPCW] in { def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, [(set RFP32:$dst, fpimm0)]>; def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, @@ -564,13 +615,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } -let SchedRW = [WriteFLD0] in +let SchedRW = [WriteFLD0], Uses = [FPCW] in def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">; -let SchedRW = [WriteFLD1] in +let SchedRW = [WriteFLD1], Uses = [FPCW] in def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">; -let SchedRW = [WriteFLDC] in { +let SchedRW = [WriteFLDC], Uses = [FPCW] in { def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>; def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>; def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>; @@ -695,21 +746,17 @@ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), //===----------------------------------------------------------------------===// // Required for RET of f32 / f64 / f80 values. -def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>; -def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>; -def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; +def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>; +def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>; +def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>; // Required for CALL which return f32 / f64 / f80 values. 
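The FP*_TO_INT*_IN_MEM pseudos earlier in this file's diff implement a truncating x87-to-integer store: fistp honours whatever rounding mode is currently in FPCW, so the expansion has to force round-toward-zero around the store and then restore the control word (the OR used to set those bits is why the pseudos now list EFLAGS as a def). Below is a minimal host-side C++ sketch of that save/force/restore pattern, purely as an illustration of the semantics rather than the sequence the backend emits.

// Host-side illustration of the truncating conversion those pseudos provide:
// save the rounding mode, force round-toward-zero, convert, restore.  This is
// a sketch of the semantics only, not the code the backend emits.
#include <cfenv>
#include <cmath>
#include <cstdio>

long long fp_to_int_truncating(double X) {
  const int OldMode = std::fegetround();  // analogous to saving FPCW
  std::fesetround(FE_TOWARDZERO);         // analogous to setting the RC bits in FPCW
  long long Result = std::llrint(X);      // llrint honours the current rounding mode
  std::fesetround(OldMode);               // restore the original mode
  return Result;
}

int main() {
  std::printf("%lld\n", fp_to_int_truncating(2.7));  // prints 2, not 3
}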
-def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; -def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, - RFP64:$src)>; -def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, - RFP80:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, - RFP80:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, - RFP80:$src)>; +def : Pat<(X86fstf32 RFP32:$src, addr:$op), (ST_Fp32m addr:$op, RFP32:$src)>; +def : Pat<(X86fstf32 RFP64:$src, addr:$op), (ST_Fp64m32 addr:$op, RFP64:$src)>; +def : Pat<(X86fstf64 RFP64:$src, addr:$op), (ST_Fp64m addr:$op, RFP64:$src)>; +def : Pat<(X86fstf32 RFP80:$src, addr:$op), (ST_Fp80m32 addr:$op, RFP80:$src)>; +def : Pat<(X86fstf64 RFP80:$src, addr:$op), (ST_Fp80m64 addr:$op, RFP80:$src)>; +def : Pat<(X86fstf80 RFP80:$src, addr:$op), (ST_FpP80m addr:$op, RFP80:$src)>; // Floating point constant -0.0 and -1.0 def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>; @@ -720,7 +767,11 @@ def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; // Used to conv. i64 to f64 since there isn't a SSE version. -def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; +def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>; + +// Used to conv. between f80 and i64 for i64 atomic loads. +def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>; +def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>; // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. diff --git a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 7d31cfab4137..d42fec3770c7 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -1,9 +1,8 @@ //===-- X86InstrFoldTables.cpp - X86 Instruction Folding Tables -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,6 +33,17 @@ using namespace llvm; // tables that would be incorrect. The manual review process allows us a chance // to catch these before they become observable bugs. 
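The folding tables that start here are kept sorted by the register-form opcode; that ordering is what lets the lookup hunks later in this file swap the explicit std::lower_bound calls for the range-based llvm::lower_bound, and lets the FMA3 table earlier in the diff use llvm::partition_point for the same search. Below is a self-contained sketch of that lookup scheme; the entry layout mirrors the real tables, but the struct, opcode values and flag values are made up for illustration.

// Binary search over a table sorted by register-form opcode, in the style of
// the X86 memory-folding tables.  All values here are illustrative.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>

struct FoldTableEntry {
  unsigned KeyOp;   // register-form opcode, the sort key
  unsigned MemOp;   // memory-form opcode
  uint16_t Flags;   // folding constraints (store-only, no-reverse, alignment)
};

// Must stay sorted by KeyOp for the binary search below to be valid.
static const FoldTableEntry Table[] = {
    {100, 1100, 0x0}, {250, 1250, 0x1}, {400, 1400, 0x2},
};

const FoldTableEntry *lookupFold(unsigned RegOp) {
  // The range form, llvm::lower_bound(Table, RegOp), is shorthand for this.
  const FoldTableEntry *I = std::lower_bound(
      std::begin(Table), std::end(Table), RegOp,
      [](const FoldTableEntry &E, unsigned Op) { return E.KeyOp < Op; });
  // The same search written as a predicate, the shape the FMA3 lookup uses.
  const FoldTableEntry *P = std::partition_point(
      std::begin(Table), std::end(Table),
      [RegOp](const FoldTableEntry &E) { return E.KeyOp < RegOp; });
  (void)P;  // I == P always holds for a table sorted by KeyOp
  if (I != std::end(Table) && I->KeyOp == RegOp)
    return I;
  return nullptr;
}

int main() {
  if (const FoldTableEntry *E = lookupFold(250))
    std::printf("memory form %u, flags 0x%x\n", E->MemOp, unsigned(E->Flags));
}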
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { + { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, + { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, + { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, + { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, + { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, + { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, + { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, + { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, + { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, + { X86::ADD8ri_DB, X86::ADD8mi, TB_NO_REVERSE }, + { X86::ADD8rr_DB, X86::ADD8mr, TB_NO_REVERSE }, { X86::ADC16ri, X86::ADC16mi, 0 }, { X86::ADC16ri8, X86::ADC16mi8, 0 }, { X86::ADC16rr, X86::ADC16mr, 0 }, @@ -48,22 +58,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::ADC8rr, X86::ADC8mr, 0 }, { X86::ADD16ri, X86::ADD16mi, 0 }, { X86::ADD16ri8, X86::ADD16mi8, 0 }, - { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, - { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, { X86::ADD16rr, X86::ADD16mr, 0 }, - { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, { X86::ADD32ri, X86::ADD32mi, 0 }, { X86::ADD32ri8, X86::ADD32mi8, 0 }, - { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, - { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, { X86::ADD32rr, X86::ADD32mr, 0 }, - { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, { X86::ADD64ri32, X86::ADD64mi32, 0 }, - { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, { X86::ADD64ri8, X86::ADD64mi8, 0 }, - { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, { X86::ADD64rr, X86::ADD64mr, 0 }, - { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, { X86::ADD8ri, X86::ADD8mi, 0 }, { X86::ADD8ri8, X86::ADD8mi8, 0 }, { X86::ADD8rr, X86::ADD8mr, 0 }, @@ -247,7 +248,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::XOR64rr, X86::XOR64mr, 0 }, { X86::XOR8ri, X86::XOR8mi, 0 }, { X86::XOR8ri8, X86::XOR8mi8, 0 }, - { X86::XOR8rr, X86::XOR8mr, 0 } + { X86::XOR8rr, X86::XOR8mr, 0 }, }; static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { @@ -305,9 +306,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, - { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE }, - { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, - { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, + { X86::MOVPQIto64rr, X86::MOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MOVSDto64rr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MOVSS2DIrr, X86::MOVSSmr, TB_FOLDED_STORE }, { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, @@ -321,22 +322,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, - { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, - { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, - { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, - { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, - { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, - { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, - { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, - { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, - { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, - { X86::SETNEr, 
X86::SETNEm, TB_FOLDED_STORE }, - { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, - { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, - { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, - { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, - { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, - { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, + { X86::SETCCr, X86::SETCCm, TB_FOLDED_STORE }, { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, @@ -403,12 +389,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE }, { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVPDI2DIrr, X86::VMOVPDI2DImr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE }, - { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE }, - { X86::VMOVSDto64rr, X86::VMOVSDto64mr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVSDto64Zrr, X86::VMOVSDZmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVSDto64rr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVSS2DIZrr, X86::VMOVSSZmr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIrr, X86::VMOVSSmr, TB_FOLDED_STORE }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, @@ -544,14 +530,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::MOV16rr, X86::MOV16rm, 0 }, { X86::MOV32rr, X86::MOV32rm, 0 }, { X86::MOV64rr, X86::MOV64rm, 0 }, - { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, - { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, + { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, + { X86::MOV64toSDrr, X86::MOVSDrm_alt, TB_NO_REVERSE }, { X86::MOV8rr, X86::MOV8rm, 0 }, { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, - { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, + { X86::MOVDI2SSrr, X86::MOVSSrm_alt, 0 }, { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, @@ -628,7 +614,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::SQRTSSr, X86::SQRTSSm, 0 }, { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, - // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, @@ -663,7 +648,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE }, { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE }, - { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 }, + { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 }, { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 }, { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, @@ -671,6 +656,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VCVTDQ2PSZ256rr, 
X86::VCVTDQ2PSZ256rm, 0 }, { X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0 }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, + { X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0 }, + { X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rm, 0 }, + { X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2DQZ128rr, X86::VCVTPD2DQZ128rm, 0 }, { X86::VCVTPD2DQZ256rr, X86::VCVTPD2DQZ256rm, 0 }, @@ -830,10 +818,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 }, { X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 }, { X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 }, - { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, - { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, - { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, + { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, + { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE }, + { X86::VMOV64toSDZrr, X86::VMOVSDZrm_alt, TB_NO_REVERSE }, + { X86::VMOV64toSDrr, X86::VMOVSDrm_alt, TB_NO_REVERSE }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, @@ -851,8 +839,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, - { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, + { X86::VMOVDI2SSZrr, X86::VMOVSSZrm_alt, 0 }, + { X86::VMOVDI2SSrr, X86::VMOVSSrm_alt, 0 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, @@ -1206,6 +1194,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { }; static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { + { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, + { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, + { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, + { X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE }, { X86::ADC16rr, X86::ADC16rm, 0 }, { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, @@ -1213,11 +1205,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::ADCX32rr, X86::ADCX32rm, 0 }, { X86::ADCX64rr, X86::ADCX64rm, 0 }, { X86::ADD16rr, X86::ADD16rm, 0 }, - { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, { X86::ADD32rr, X86::ADD32rm, 0 }, - { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, { X86::ADD64rr, X86::ADD64rm, 0 }, - { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, { X86::ADD8rr, X86::ADD8rm, 0 }, { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, @@ -1247,54 +1236,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, - { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, - { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, - { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, - { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, - { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, - { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, - { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, - { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, - { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, - { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, - { X86::CMOVBE32rr, 
X86::CMOVBE32rm, 0 }, - { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, - { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, - { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, - { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, - { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, - { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, - { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, - { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, - { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, - { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, - { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, - { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, - { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, - { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, - { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, - { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, - { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, - { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, - { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, - { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, - { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, - { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, - { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, - { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, - { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, - { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, - { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, - { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, - { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, - { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, - { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, - { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, - { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, - { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, - { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, - { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, - { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, + { X86::CMOV16rr, X86::CMOV16rm, 0 }, + { X86::CMOV32rr, X86::CMOV32rm, 0 }, + { X86::CMOV64rr, X86::CMOV64rm, 0 }, { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, { X86::CMPSDrr, X86::CMPSDrm, 0 }, @@ -1421,6 +1365,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE }, { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, + { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, @@ -1576,7 +1521,6 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST *mr. 
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, @@ -1697,6 +1641,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0 }, { X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0 }, { X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0 }, + { X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0 }, + { X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0 }, + { X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0 }, + { X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0 }, + { X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0 }, + { X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0 }, { X86::VCVTPD2DQZ128rrkz, X86::VCVTPD2DQZ128rmkz, 0 }, { X86::VCVTPD2DQZ256rrkz, X86::VCVTPD2DQZ256rmkz, 0 }, { X86::VCVTPD2DQZrrkz, X86::VCVTPD2DQZrmkz, 0 }, @@ -2030,6 +1980,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMOVDQU8Zrrkz, X86::VMOVDQU8Zrmkz, TB_NO_REVERSE }, { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE }, { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, + { X86::VMOVSDZrr, X86::VMOVLPDZ128rm, TB_NO_REVERSE }, + { X86::VMOVSDrr, X86::VMOVLPDrm, TB_NO_REVERSE }, { X86::VMOVSHDUPZ128rrkz, X86::VMOVSHDUPZ128rmkz, 0 }, { X86::VMOVSHDUPZ256rrkz, X86::VMOVSHDUPZ256rmkz, 0 }, { X86::VMOVSHDUPZrrkz, X86::VMOVSHDUPZrmkz, 0 }, @@ -2072,6 +2024,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, { X86::VORPSZrr, X86::VORPSZrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, + { X86::VP2INTERSECTDZ128rr, X86::VP2INTERSECTDZ128rm, 0 }, + { X86::VP2INTERSECTDZ256rr, X86::VP2INTERSECTDZ256rm, 0 }, + { X86::VP2INTERSECTDZrr, X86::VP2INTERSECTDZrm, 0 }, + { X86::VP2INTERSECTQZ128rr, X86::VP2INTERSECTQZ128rm, 0 }, + { X86::VP2INTERSECTQZ256rr, X86::VP2INTERSECTQZ256rm, 0 }, + { X86::VP2INTERSECTQZrr, X86::VP2INTERSECTQZrm, 0 }, { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 }, { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 }, { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, @@ -3074,6 +3032,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0 }, { X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0 }, { X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0 }, + { X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0 }, + { X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0 }, + { X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0 }, + { X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0 }, + { X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0 }, + { X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0 }, { X86::VCVTPD2DQZ128rrk, X86::VCVTPD2DQZ128rmk, 0 }, { X86::VCVTPD2DQZ256rrk, X86::VCVTPD2DQZ256rmk, 0 }, { X86::VCVTPD2DQZrrk, X86::VCVTPD2DQZrmk, 0 }, @@ -3162,6 +3126,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, + { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 }, + { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 }, + { X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0 }, { X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0 }, { X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0 }, { X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE }, @@ -4376,6 +4343,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VANDPSZ128rrk, 
X86::VANDPSZ128rmk, 0 }, { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, + { X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 }, + { X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 }, + { X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 }, { X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE }, { X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE }, { X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 }, @@ -4389,6 +4359,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, + { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 }, + { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 }, + { X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0 }, + { X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 }, + { X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 }, + { X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 }, { X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 }, { X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 }, { X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 }, @@ -5315,9 +5291,7 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { } #endif - const X86MemoryFoldTableEntry *Data = std::lower_bound(Table.begin(), - Table.end(), - RegOp); + const X86MemoryFoldTableEntry *Data = llvm::lower_bound(Table, RegOp); if (Data != Table.end() && Data->KeyOp == RegOp && !(Data->Flags & TB_NO_FORWARD)) return Data; @@ -5404,7 +5378,7 @@ static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable; const X86MemoryFoldTableEntry * llvm::lookupUnfoldTable(unsigned MemOp) { auto &Table = MemUnfoldTable->Table; - auto I = std::lower_bound(Table.begin(), Table.end(), MemOp); + auto I = llvm::lower_bound(Table, MemOp); if (I != Table.end() && I->KeyOp == MemOp) return &*I; return nullptr; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h index 90016baead96..419baf98f61d 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h +++ b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.h @@ -1,9 +1,8 @@ //===-- X86InstrFoldTables.h - X86 Instruction Folding Tables ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td index 47d4719d3060..e8f0d937dff4 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -1,9 +1,8 @@ //===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -27,10 +26,13 @@ def RawFrmDst : Format<5>; def RawFrmDstSrc : Format<6>; def RawFrmImm8 : Format<7>; def RawFrmImm16 : Format<8>; +def AddCCFrm : Format<9>; def MRMDestMem : Format<32>; def MRMSrcMem : Format<33>; def MRMSrcMem4VOp3 : Format<34>; def MRMSrcMemOp4 : Format<35>; +def MRMSrcMemCC : Format<36>; +def MRMXmCC: Format<38>; def MRMXm : Format<39>; def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>; def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>; @@ -39,6 +41,8 @@ def MRMDestReg : Format<48>; def MRMSrcReg : Format<49>; def MRMSrcReg4VOp3 : Format<50>; def MRMSrcRegOp4 : Format<51>; +def MRMSrcRegCC : Format<52>; +def MRMXrCC: Format<54>; def MRMXr : Format<55>; def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>; def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>; @@ -206,13 +210,10 @@ class TAPS : TA { Prefix OpPrefix = PS; } class TAPD : TA { Prefix OpPrefix = PD; } class TAXD : TA { Prefix OpPrefix = XD; } class VEX { Encoding OpEnc = EncVEX; } -class VEX_W { bits<2> VEX_WPrefix = 1; } -class VEX_WIG { bits<2> VEX_WPrefix = 2; } +class VEX_W { bit HasVEX_W = 1; } +class VEX_WIG { bit IgnoresVEX_W = 1; } // Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX. -// FIXME: We should consider adding separate bits for VEX_WIG and the extra -// part of W1X. This would probably simplify the tablegen emitters and -// the TSFlags creation below. -class VEX_W1X { bits<2> VEX_WPrefix = 3; } +class VEX_W1X { bit HasVEX_W = 1; bit EVEX_W1_VEX_W0 = 1; } class VEX_4V : VEX { bit hasVEX_4V = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } @@ -296,7 +297,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasREPPrefix = 0; // Does this inst have a REP prefix? Encoding OpEnc = EncNormal; // Encoding used by this instruction bits<2> OpEncBits = OpEnc.Value; - bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit HasVEX_W = 0; // Does this inst set the VEX_W field? + bit IgnoresVEX_W = 0; // Does this inst ignore VEX_W field? + bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX + // instruction with VEX.W == 0. bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit @@ -311,11 +315,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. bit hasNoTrackPrefix = 0; // Does this inst has 0x3E (NoTrack) prefix? - bits<2> EVEX_LL; - let EVEX_LL{0} = hasVEX_L; - let EVEX_LL{1} = hasEVEX_L2; // Vector size in bytes. - bits<7> VectSize = !shl(16, EVEX_LL); + bits<7> VectSize = !if(hasEVEX_L2, 64, !if(hasVEX_L, 32, 16)); // The scaling factor for AVX512's compressed displacement is either // - the size of a power-of-two number of elements or @@ -355,7 +356,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{29-28} = OpEncBits; let TSFlags{37-30} = Opcode; // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0. 
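The TSFlags hunk that follows packs these per-instruction format bits into fixed positions of a 64-bit word (bit 38 now carries VEX.W after the two-bit VEX_WPrefix is split into HasVEX_W, IgnoresVEX_W and EVEX_W1_VEX_W0; bits 39, 40 and 41 carry VEX.VVVV, VEX.L and EVEX.K), and the new VectSize line is a plain select on the L2/L bits. A small C++ sketch of the same packing and query; the bit positions come from the hunk, the helper names are made up.

// Illustration of how the boolean format bits end up packed into the 64-bit
// TSFlags word.  Bit positions follow the diff (38 = VEX.W, 39 = VEX.VVVV,
// 40 = VEX.L, 41 = EVEX.K); the helper names are hypothetical.
#include <cstdint>
#include <cstdio>

constexpr uint64_t packTSFlags(bool HasVEX_W, bool HasVEX_4V, bool HasVEX_L,
                               bool HasEVEX_K) {
  return (uint64_t(HasVEX_W) << 38) | (uint64_t(HasVEX_4V) << 39) |
         (uint64_t(HasVEX_L) << 40) | (uint64_t(HasEVEX_K) << 41);
}

// The VectSize computation from the hunk: 64 bytes if EVEX.L2 is set,
// otherwise 32 if VEX.L is set, otherwise 16.
constexpr unsigned vectSizeBytes(bool HasEVEX_L2, bool HasVEX_L) {
  return HasEVEX_L2 ? 64 : HasVEX_L ? 32 : 16;
}

int main() {
  uint64_t TSFlags = packTSFlags(/*W*/ true, /*4V*/ true, /*L*/ false, /*K*/ false);
  bool HasVEX_W = (TSFlags >> 38) & 1;
  std::printf("VEX.W=%d, vector size=%u bytes\n", int(HasVEX_W),
              vectSizeBytes(false, true));
}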
- let TSFlags{38} = VEX_WPrefix{0}; + let TSFlags{38} = HasVEX_W; let TSFlags{39} = hasVEX_4V; let TSFlags{40} = hasVEX_L; let TSFlags{41} = hasEVEX_K; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 3d508e2c34f3..096cc27861ca 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1,9 +1,8 @@ //===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -100,8 +99,10 @@ def X86insertps : SDNode<"X86ISD::INSERTPS", def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; -def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -127,21 +128,31 @@ def X86vfpext : SDNode<"X86ISD::VFPEXT", def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, - SDTCisSameSizeAs<0, 1>]>>; + SDTCisOpSmallerThanOp<0, 1>]>>; -def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND", +def X86frounds : SDNode<"X86ISD::VFPROUNDS", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f64>, + SDTCisSameSizeAs<0, 2>]>>; + +def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, SDTCisSameAs<0, 1>, SDTCVecEltisVT<2, f64>, SDTCisSameSizeAs<0, 2>, SDTCisVT<3, i32>]>>; -def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND", - SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>, +def X86fpexts : SDNode<"X86ISD::VFPEXTS", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, SDTCisSameAs<0, 1>, SDTCVecEltisVT<2, f32>, - SDTCisSameSizeAs<0, 2>, - SDTCisVT<3, i32>]>>; + SDTCisSameSizeAs<0, 2>]>>; +def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, + SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f32>, + SDTCisSameSizeAs<0, 2>]>>; def X86vmfpround: SDNode<"X86ISD::VMFPROUND", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, @@ -164,25 +175,14 @@ def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; -def X86CmpMaskCCRound : - SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, - SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>, - SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, - SDTCisVT<4, i32>]>; def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86CmpMaskCCScalarRound : - SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, - SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; - def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -// Hack to make CMPM commutable in tablegen patterns for load folding. 
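X86vzld (X86ISD::VZEXT_LOAD), defined near the top of this file's diff, models loads such as movss or movq from memory that fill only the low scalar element of an XMM register and zero everything above it; the width-specific X86vzload32/X86vzload64 fragments later in the diff then select on the memory type. A small host-side sketch of that zero-extending load, with made-up helper names, is below; it illustrates the semantics only.

// Read N bytes from memory into the low part of a 16-byte XMM-sized value
// and zero the rest, which is what movss/movq loads from memory do.
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

std::array<uint8_t, 16> vzext_load(const void *Src, std::size_t NumBytes) {
  std::array<uint8_t, 16> Vec{};            // upper lanes start out zero
  std::memcpy(Vec.data(), Src, NumBytes);   // fill only the low element
  return Vec;
}

int main() {
  uint64_t Scalar = 0x1122334455667788ull;
  auto Vec = vzext_load(&Scalar, sizeof(Scalar));   // the "vzload64" case
  std::printf("high qword bytes: %02x .. %02x (all zero)\n",
              unsigned(Vec[8]), unsigned(Vec[15]));
}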
-def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>; -def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>; def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; -def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>; +def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>; def X86phminpos: SDNode<"X86ISD::PHMINPOS", SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>; @@ -301,25 +301,15 @@ def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisVT<3, i32>]>; -def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisVT<3, i32>, - SDTCisVT<4, i32>]>; -def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisInt<3>, - SDTCisSameSizeAs<0, 3>, - SDTCisSameNumEltsAs<0, 3>, - SDTCisVT<4, i32>, - SDTCisVT<5, i32>]>; -def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, +def SDTFPTernaryOpImm: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisInt<3>, + SDTCisSameSizeAs<0, 3>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, i32>]>; +def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>]>; -def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisVT<2, i32>, - SDTCisVT<3, i32>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, @@ -375,11 +365,23 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; -def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>; -def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>; - -def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>; -def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>; +def X86Movsd : SDNode<"X86ISD::MOVSD", + SDTypeProfile<1, 2, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v2f64>, + SDTCisVT<2, v2f64>]>>; +def X86Movss : SDNode<"X86ISD::MOVSS", + SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", + SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", + SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>>; def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>, SDTCisVec<1>, SDTCisInt<1>, @@ -423,16 +425,18 @@ def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; -def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>; -def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>; +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImm>; +def X86VFixupimmSAE : SDNode<"X86ISD::VFIXUPIMM_SAE", SDTFPTernaryOpImm>; +def X86VFixupimms : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImm>; +def X86VFixupimmSAEs : SDNode<"X86ISD::VFIXUPIMMS_SAE", SDTFPTernaryOpImm>; def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>; -def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>; +def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>; def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>; -def X86VReduceRnd 
: SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>; +def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>; def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>; -def X86VRndScaleRnd: SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>; +def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>; def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>; -def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>; +def X86VGetMantSAE : SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>; def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, SDTCisFP<1>, @@ -450,27 +454,42 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; +def X86Blendv : SDNode<"X86ISD::BLENDV", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<2, 3>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0, 1>]>>; def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86fadds : SDNode<"X86ISD::FADDS", SDTFPBinOp>; def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fsubs : SDNode<"X86ISD::FSUBS", SDTFPBinOp>; def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fmuls : SDNode<"X86ISD::FMULS", SDTFPBinOp>; def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fdivs : SDNode<"X86ISD::FDIVS", SDTFPBinOp>; def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>; +def X86fmaxSAE : SDNode<"X86ISD::FMAX_SAE", SDTFPBinOp>; +def X86fmaxSAEs : SDNode<"X86ISD::FMAXS_SAE", SDTFPBinOp>; +def X86fminSAE : SDNode<"X86ISD::FMIN_SAE", SDTFPBinOp>; +def X86fminSAEs : SDNode<"X86ISD::FMINS_SAE", SDTFPBinOp>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOp>; +def X86scalefRnd : SDNode<"X86ISD::SCALEF_RND", SDTFPBinOpRound>; +def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOp>; +def X86scalefsRnd: SDNode<"X86ISD::SCALEFS_RND", SDTFPBinOpRound>; def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrts : SDNode<"X86ISD::FSQRTS", SDTFPBinOp>; def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>; -def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>; +def X86fgetexp : SDNode<"X86ISD::FGETEXP", SDTFPUnaryOp>; +def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>; +def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>; +def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>; def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; @@ -486,6 +505,10 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat def X86FmaddsubRnd : 
SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>; def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>; +def X86vp2intersect : SDNode<"X86ISD::VP2INTERSECT", + SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, + SDTCisVec<1>, SDTCisSameAs<1, 2>]>>; + def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; @@ -502,27 +525,36 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>; def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>; def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>; -def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; -def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; -def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>; +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>; +def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>; +def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>; +def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>; +def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>; +def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>; def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>; def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>; -def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>; +def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>; +def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>; +def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>; def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>; def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>; def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>; def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>; -def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>; -def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>; -def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>; -def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>; - -def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, - [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; -def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, - [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +def X86RangesSAE : SDNode<"X86ISD::VRANGES_SAE", SDTFPBinOpImm>; +def X86RndScalesSAE : SDNode<"X86ISD::VRNDSCALES_SAE", SDTFPBinOpImm>; +def X86ReducesSAE : SDNode<"X86ISD::VREDUCES_SAE", SDTFPBinOpImm>; +def X86GetMantsSAE : SDNode<"X86ISD::VGETMANTS_SAE", SDTFPBinOpImm>; + +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<0, 3>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<0, 3>]>, []>; // vpshufbitqmb def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB", @@ -531,6 +563,8 @@ def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB", SDTCVecEltisVT<0,i1>, SDTCisSameNumEltsAs<0,1>]>>; +def SDTintToFP: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisVT<3, i32>]>; @@ -552,13 +586,15 @@ def SDTVintToFPRound: SDTypeProfile<1, 
2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i32>]>; // Scalar +def X86SintToFp : SDNode<"X86ISD::SCALAR_SINT_TO_FP", SDTintToFP>; def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFp : SDNode<"X86ISD::SCALAR_UINT_TO_FP", SDTintToFP>; def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>; def X86cvtts2Int : SDNode<"X86ISD::CVTTS2SI", SDTSFloatToInt>; def X86cvtts2UInt : SDNode<"X86ISD::CVTTS2UI", SDTSFloatToInt>; -def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>; -def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>; +def X86cvtts2IntSAE : SDNode<"X86ISD::CVTTS2SI_SAE", SDTSFloatToInt>; +def X86cvtts2UIntSAE : SDNode<"X86ISD::CVTTS2UI_SAE", SDTSFloatToInt>; def X86cvts2si : SDNode<"X86ISD::CVTS2SI", SDTSFloatToInt>; def X86cvts2usi : SDNode<"X86ISD::CVTS2UI", SDTSFloatToInt>; @@ -568,8 +604,8 @@ def X86cvts2usiRnd : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff -def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>; -def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>; +def X86cvttp2siSAE : SDNode<"X86ISD::CVTTP2SI_SAE", SDTFloatToInt>; +def X86cvttp2uiSAE : SDNode<"X86ISD::CVTTP2UI_SAE", SDTFloatToInt>; def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>; def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>; @@ -592,6 +628,13 @@ def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>; +// Masked versions of above +def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisInt<1>, + SDTCisSameSizeAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<1, 3>]>; def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>, SDTCisSameSizeAs<0, 1>, @@ -599,6 +642,9 @@ def SDTMFloatToInt: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<1, 3>]>; +def X86VMSintToFP : SDNode<"X86ISD::MCVTSI2P", SDTMVintToFP>; +def X86VMUintToFP : SDNode<"X86ISD::MCVTUI2P", SDTMVintToFP>; + def X86mcvtp2Int : SDNode<"X86ISD::MCVTP2SI", SDTMFloatToInt>; def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>; def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>; @@ -609,10 +655,9 @@ def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, i16>]> >; -def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, i16>, - SDTCisVT<2, i32>]> >; +def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>]> >; def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, @@ -625,17 +670,35 @@ def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH", SDTCisSameAs<0, 3>, SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<1, 4>]> >; -def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, +def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, SDTCVecEltisVT<1, f32>, - SDTCisOpSmallerThanOp<1, 0>, - SDTCisVT<2, i32>]>>; + SDTCisOpSmallerThanOp<1, 0>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, SDTCisOpSmallerThanOp<0, 1>, SDTCisVT<2, i32>]>>; +// 
cvt fp to bfloat16 +def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; +def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<1, 3>]>>; +def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>]>>; +def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0,1>, + SDTCVecEltisVT<2, i32>, + SDTCisSameAs<2,3>]>>; + // galois field arithmetic def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; @@ -655,18 +718,8 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot, SDNPWantParent]>; -def ssmem : Operand<v4f32> { - let PrintMethod = "printf32mem"; - let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); - let ParserMatchClass = X86Mem32AsmOperand; - let OperandType = "OPERAND_MEMORY"; -} -def sdmem : Operand<v2f64> { - let PrintMethod = "printf64mem"; - let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); - let ParserMatchClass = X86Mem64AsmOperand; - let OperandType = "OPERAND_MEMORY"; -} +def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; //===----------------------------------------------------------------------===// // SSE pattern fragments @@ -697,9 +750,9 @@ def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; // 128-/256-/512-bit extload pattern fragments -def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; -def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; -def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; +def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; // Like 'store', but always requires vector size alignment. 
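As an aside on the fragment introduced by the comment above: the "requires vector size alignment" test is expressed as a small C++ predicate over the store node. A minimal sketch of what such a predicate looks like, with the exact bound (the memory VT's store size) being an assumption rather than something stated in this hunk:

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // True when the store is at least as aligned as the vector value it writes.
    static bool isVectorSizeAlignedStore(const SDNode *N) {
      auto *St = cast<StoreSDNode>(N);
      return St->getAlignment() >= St->getMemoryVT().getStoreSize();
    }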
def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -886,15 +939,20 @@ def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; -def vzmovl_v2i64 : PatFrag<(ops node:$src), - (bitconvert (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; -def vzmovl_v4i32 : PatFrag<(ops node:$src), - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; +def X86vzload32 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4; +}]>; -def vzload_v2i64 : PatFrag<(ops node:$src), - (bitconvert (v2i64 (X86vzload node:$src)))>; +def X86vzload64 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; +}]>; + +def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr), + (X86vextractst node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; +}]>; def fp32imm0 : PatLeaf<(f32 fpimm), [{ @@ -905,20 +963,6 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{ return N->isExactlyValue(+0.0); }]>; -def I8Imm : SDNodeXForm<imm, [{ - // Transformation function: get the low 8 bits. - return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); -}]>; - -def FROUND_NO_EXC : PatLeaf<(i32 8)>; -def FROUND_CURRENT : PatLeaf<(i32 4)>; - -// BYTE_imm - Transform bit immediates into byte immediates. -def BYTE_imm : SDNodeXForm<imm, [{ - // Transformation function: imm >> 3 - return getI32Imm(N->getZExtValue() >> 3, SDLoc(N)); -}]>; - // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ @@ -945,8 +989,10 @@ def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, - node:$index), [{}], - EXTRACT_get_vextract128_imm>; + node:$index), [{ + // Index 0 can be handled via extract_subreg. + return !isNullConstant(N->getOperand(1)); +}], EXTRACT_get_vextract128_imm>; def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), @@ -956,8 +1002,10 @@ def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, - node:$index), [{}], - EXTRACT_get_vextract256_imm>; + node:$index), [{ + // Index 0 can be handled via extract_subreg. 
+ return !isNullConstant(N->getOperand(1)); +}], EXTRACT_get_vextract256_imm>; def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), @@ -965,70 +1013,46 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), [{}], INSERT_get_vinsert256_imm>; -def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_load node:$src1, node:$src2, node:$src3), [{ +def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_ld node:$src1, node:$src2, node:$src3), [{ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; -def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mload node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16; -}]>; - -def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mload node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32; -}]>; - -def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mload node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64; -}]>; - -def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), +def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_load node:$src1, node:$src2, node:$src3), [{ - return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && - cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; + // Use the node type to determine the size the alignment needs to match. + // We can't use memory VT because type widening changes the node VT, but + // not the memory VT. + auto *Ld = cast<MaskedLoadSDNode>(N); + return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize(); }]>; def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_load node:$src1, node:$src2, node:$src3), [{ + (masked_ld node:$src1, node:$src2, node:$src3), [{ return cast<MaskedLoadSDNode>(N)->isExpandingLoad(); }]>; // Masked store fragments. // X86mstore can't be implemented in core DAG files because some targets // do not support vector types (llvm-tblgen will fail). 
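The comment above is the heart of the new masked_load_aligned / masked_store_aligned fragments: legalization can widen the node's value type while the memory type stays narrow, so the alignment bound has to come from the node type. A short worked illustration, with the concrete types chosen only for the example:

    // A masked load of v2f32 that gets type-widened during legalization:
    //   memory VT : v2f32  -> store size  8 bytes
    //   node VT   : v4f32  -> store size 16 bytes
    // Checking against the memory VT would accept an 8-byte-aligned pointer,
    // but the aligned masked-move instruction eventually selected operates on
    // the widened v4f32 register, so the predicate compares against the node
    // type instead:
    //   auto *Ld = cast<MaskedLoadSDNode>(N);
    //   return Ld->getAlignment() >= Ld->getValueType(0).getStoreSize();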
-def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ +def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_st node:$src1, node:$src2, node:$src3), [{ return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); }]>; -def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16; -}]>; - -def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32; -}]>; - -def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64; -}]>; - -def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), +def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_store node:$src1, node:$src2, node:$src3), [{ - return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && - (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); + // Use the node type to determine the size the alignment needs to match. + // We can't use memory VT because type widening changes the node VT, but + // not the memory VT. + auto *St = cast<MaskedStoreSDNode>(N); + return St->getAlignment() >= St->getOperand(1).getValueType().getStoreSize(); }]>; def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (masked_st node:$src1, node:$src2, node:$src3), [{ return cast<MaskedStoreSDNode>(N)->isCompressingStore(); }]>; @@ -1036,7 +1060,7 @@ def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), // X86mtruncstore can't be implemented in core DAG files because some targets // doesn't support vector type ( llvm-tblgen will fail) def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (masked_st node:$src1, node:$src2, node:$src3), [{ return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); }]>; def masked_truncstorevi8 : diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index ab14ee7fadf2..dbe45356c42b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1,9 +1,8 @@ //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -220,16 +219,22 @@ static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) { return true; case X86::MOV32rm: case X86::MOVSSrm: - case X86::VMOVSSZrm: + case X86::MOVSSrm_alt: case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: + case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::KMOVDkm: MemBytes = 4; return true; case X86::MOV64rm: case X86::LD_Fp64m: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::KMOVQkm: @@ -483,9 +488,10 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: - case X86::LD_Fp64m: case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: @@ -493,7 +499,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MOVDQArm: case X86::MOVDQUrm: case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -510,7 +518,9 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MMX_MOVQ64rm: // AVX-512 case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: @@ -590,96 +600,12 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return true; } -bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - MachineBasicBlock::iterator E = MBB.end(); - - // For compile time consideration, if we are not able to determine the - // safety after visiting 4 instructions in each direction, we will assume - // it's not safe. - MachineBasicBlock::iterator Iter = I; - for (unsigned i = 0; Iter != E && i < 4; ++i) { - bool SeenDef = false; - for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { - MachineOperand &MO = Iter->getOperand(j); - if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) - SeenDef = true; - if (!MO.isReg()) - continue; - if (MO.getReg() == X86::EFLAGS) { - if (MO.isUse()) - return false; - SeenDef = true; - } - } - - if (SeenDef) - // This instruction defines EFLAGS, no need to look any further. - return true; - ++Iter; - // Skip over debug instructions. - while (Iter != E && Iter->isDebugInstr()) - ++Iter; - } - - // It is safe to clobber EFLAGS at the end of a block of no successor has it - // live in. - if (Iter == E) { - for (MachineBasicBlock *S : MBB.successors()) - if (S->isLiveIn(X86::EFLAGS)) - return false; - return true; - } - - MachineBasicBlock::iterator B = MBB.begin(); - Iter = I; - for (unsigned i = 0; i < 4; ++i) { - // If we make it to the beginning of the block, it's safe to clobber - // EFLAGS iff EFLAGS is not live-in. - if (Iter == B) - return !MBB.isLiveIn(X86::EFLAGS); - - --Iter; - // Skip over debug instructions. - while (Iter != B && Iter->isDebugInstr()) - --Iter; - - bool SawKill = false; - for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { - MachineOperand &MO = Iter->getOperand(j); - // A register mask may clobber EFLAGS, but we should still look for a - // live EFLAGS def. 
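Further down in this hunk the hand-rolled operand walk that looked for an EFLAGS def is replaced by a single call to MachineInstr::modifiesRegister. A minimal sketch of the new check (the X86 target-internal headers are assumed):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    #include "MCTargetDesc/X86MCTargetDesc.h" // X86::EFLAGS (target-internal)
    using namespace llvm;

    static bool clobbersEFLAGS(const MachineInstr &MI,
                               const TargetRegisterInfo &TRI) {
      // Covers explicit defs, implicit defs and register-mask clobbers, which
      // is what the removed loop over MI.operands() approximated by hand.
      return MI.modifiesRegister(X86::EFLAGS, &TRI);
    }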
- if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) - SawKill = true; - if (MO.isReg() && MO.getReg() == X86::EFLAGS) { - if (MO.isDef()) return MO.isDead(); - if (MO.isKill()) SawKill = true; - } - } - - if (SawKill) - // This instruction kills EFLAGS and doesn't redefine it, so - // there's no need to look further. - return true; - } - - // Conservative answer. - return false; -} - void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { - bool ClobbersEFLAGS = false; - for (const MachineOperand &MO : Orig.operands()) { - if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { - ClobbersEFLAGS = true; - break; - } - } - + bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side // effects. @@ -796,11 +722,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, - LiveVariables *LV) const { + LiveVariables *LV, bool Is8BitOp) const { // We handle 8-bit adds and various 16-bit opcodes in the switch below. - bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri); MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); - assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( + assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && "Unexpected type for LEA transform"); @@ -830,7 +755,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); - unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit; + unsigned SubReg = Is8BitOp ? 
X86::sub_8bit : X86::sub_16bit; assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); MachineInstr *InsMI = @@ -842,19 +767,23 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA); switch (MIOpc) { default: llvm_unreachable("Unreachable!"); + case X86::SHL8ri: case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); MIB.addReg(0).addImm(1ULL << ShAmt) .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); break; } + case X86::INC8r: case X86::INC16r: addRegOffset(MIB, InRegLEA, true, 1); break; + case X86::DEC8r: case X86::DEC16r: addRegOffset(MIB, InRegLEA, true, -1); break; case X86::ADD8ri: + case X86::ADD8ri_DB: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: @@ -862,6 +791,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); break; case X86::ADD8rr: + case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { unsigned Src2 = MI.getOperand(2).getReg(); @@ -948,9 +878,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *NewMI = nullptr; bool Is64Bit = Subtarget.is64Bit(); + bool Is8BitOp = false; unsigned MIOpc = MI.getOpcode(); switch (MIOpc) { - default: return nullptr; + default: llvm_unreachable("Unreachable!"); case X86::SHL64ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); @@ -1000,12 +931,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } + case X86::SHL8ri: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::SHL16ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); } case X86::INC64r: case X86::INC32r: { @@ -1029,8 +963,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = addOffset(MIB, 1); break; } - case X86::INC16r: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); case X86::DEC64r: case X86::DEC32r: { assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); @@ -1054,8 +986,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } + case X86::DEC8r: + case X86::INC8r: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::DEC16r: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + case X86::INC16r: + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD32rr: @@ -1094,9 +1031,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::ADD8rr: + case X86::ADD8rr_DB: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::ADD16rr: case X86::ADD16rr_DB: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: @@ -1130,11 +1070,59 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::ADD8ri: + case X86::ADD8ri_DB: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return 
convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); + case X86::SUB8ri: + case X86::SUB16ri8: + case X86::SUB16ri: + /// FIXME: Support these similar to ADD8ri/ADD16ri*. + return nullptr; + case X86::SUB32ri8: + case X86::SUB32ri: { + int64_t Imm = MI.getOperand(2).getImm(); + if (!isInt<32>(-Imm)) + return nullptr; + + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; + + bool isKill; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, ImplicitOp, LV)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.add(ImplicitOp); + + NewMI = addOffset(MIB, -Imm); + break; + } + + case X86::SUB64ri8: + case X86::SUB64ri32: { + int64_t Imm = MI.getOperand(2).getImm(); + if (!isInt<32>(-Imm)) + return nullptr; + + assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); + + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), + get(X86::LEA64r)).add(Dest).add(Src); + NewMI = addOffset(MIB, -Imm); + break; + } + case X86::VMOVDQU8Z128rmk: case X86::VMOVDQU8Z256rmk: case X86::VMOVDQU8Zrmk: @@ -1522,7 +1510,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VBLENDPDrri: case X86::VBLENDPSrri: // If we're optimizing for size, try to use MOVSD/MOVSS. - if (MI.getParent()->getParent()->getFunction().optForSize()) { + if (MI.getParent()->getParent()->getFunction().hasOptSize()) { unsigned Mask, Opc; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); @@ -1548,47 +1536,90 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPBLENDWrri: case X86::VPBLENDDYrri: case X86::VPBLENDWYrri:{ - unsigned Mask; + int8_t Mask; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); - case X86::BLENDPDrri: Mask = 0x03; break; - case X86::BLENDPSrri: Mask = 0x0F; break; - case X86::PBLENDWrri: Mask = 0xFF; break; - case X86::VBLENDPDrri: Mask = 0x03; break; - case X86::VBLENDPSrri: Mask = 0x0F; break; - case X86::VBLENDPDYrri: Mask = 0x0F; break; - case X86::VBLENDPSYrri: Mask = 0xFF; break; - case X86::VPBLENDDrri: Mask = 0x0F; break; - case X86::VPBLENDWrri: Mask = 0xFF; break; - case X86::VPBLENDDYrri: Mask = 0xFF; break; - case X86::VPBLENDWYrri: Mask = 0xFF; break; + case X86::BLENDPDrri: Mask = (int8_t)0x03; break; + case X86::BLENDPSrri: Mask = (int8_t)0x0F; break; + case X86::PBLENDWrri: Mask = (int8_t)0xFF; break; + case X86::VBLENDPDrri: Mask = (int8_t)0x03; break; + case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break; + case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break; + case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break; + case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break; + case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break; + case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break; + case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break; } // Only the least significant bits of Imm are used. - unsigned Imm = MI.getOperand(3).getImm() & Mask; + // Using int8_t to ensure it will be sign extended to the int64_t that + // setImm takes in order to match isel behavior. 
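The sign-extension point in the comment above is easy to trip over, so here is a tiny self-contained illustration (values chosen for the example):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // A blend mask of 0xFF held in an int8_t sign-extends to -1 when it is
      // widened to the int64_t that MachineOperand::setImm() stores; keeping
      // it in an unsigned would instead store 255 and no longer match the
      // immediate that instruction selection originally emitted.
      int8_t   AsSigned   = (int8_t)0xFF;
      unsigned AsUnsigned = 0xFF;
      std::printf("%lld vs %lld\n", (long long)AsSigned, (long long)AsUnsigned);
      // prints: -1 vs 255
      return 0;
    }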
+ int8_t Imm = MI.getOperand(3).getImm() & Mask; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Mask ^ Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + case X86::VINSERTPSZrr: { + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + // We can commute insertps if we zero 2 of the elements, the insertion is + // "inline" and we don't override the insertion with a zero. + if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 && + countPopulation(ZMask) == 2) { + unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15); + assert(AltIdx < 4 && "Illegal insertion index"); + unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + return nullptr; + } case X86::MOVSDrr: case X86::MOVSSrr: case X86::VMOVSDrr: case X86::VMOVSSrr:{ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. - assert(Subtarget.hasSSE41() && "Commuting MOVSD/MOVSS requires SSE41!"); + if (Subtarget.hasSSE41()) { + unsigned Mask, Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; + case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; + case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; + case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + } - unsigned Mask, Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unreachable!"); - case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; - case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; - case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; - case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } + // Convert to SHUFPD. + assert(MI.getOpcode() == X86::MOVSDrr && + "Can only commute MOVSDrr without SSE4.1"); + auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + WorkingMI.setDesc(get(X86::SHUFPDrri)); + WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + case X86::SHUFPDrri: { + // Commute to MOVSD. + assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(X86::MOVSDrr)); + WorkingMI.RemoveOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -1657,7 +1688,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // Flip permute source immediate. // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. 
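Given the two immediate bits described above, commuting the sources of VPERM2F128/VPERM2I128 is a pure XOR of the immediate. A small worked example (the immediate value is picked only for illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Bit 1 (0x02) chooses the source of the low 128-bit half, bit 5 (0x20)
      // the source of the high half, so swapping the two sources is the same
      // shuffle with both bits flipped: Imm ^ 0x22.
      uint8_t Imm      = 0x31;        // {op0.hi, op1.hi} from sources (op0, op1)
      uint8_t Commuted = Imm ^ 0x22;  // 0x13: selects the same {op0.hi, op1.hi}
      assert(Commuted == 0x13);       // once the sources are passed as (op1, op0)
      return 0;
    }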
- unsigned Imm = MI.getOperand(3).getImm() & 0xFF; + int8_t Imm = MI.getOperand(3).getImm() & 0xFF; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Imm ^ 0x22); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, @@ -1686,76 +1717,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } - case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: - case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: - case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: - case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr: - case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr: - case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr: - case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr: - case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr: - case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr: - case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr: - case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr: - case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr: - case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr: - case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr: - case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: - case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: { - unsigned Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unreachable!"); - case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; - case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; - case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break; - case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break; - case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break; - case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break; - case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break; - case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break; - case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break; - case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break; - case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break; - case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break; - case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break; - case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break; - case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break; - case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break; - case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break; - case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break; - case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break; - case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break; - case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break; - case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break; - case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break; - case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break; - case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break; - case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break; - case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break; - case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break; - case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break; - case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break; - case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break; - case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break; - case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break; - case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break; - case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break; - case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break; - case X86::CMOVP16rr: Opc = 
X86::CMOVNP16rr; break; - case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break; - case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break; - case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break; - case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break; - case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break; - case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break; - case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break; - case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break; - case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break; - case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; - case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; - } + case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: { auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); + unsigned OpNo = MI.getDesc().getNumOperands() - 1; + X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm()); + WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -1879,7 +1845,6 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // regardless of the FMA opcode. The FMA opcode is adjusted later. if (SrcOpIdx1 == CommuteAnyOperandIndex || SrcOpIdx2 == CommuteAnyOperandIndex) { - unsigned CommutableOpIdx1 = SrcOpIdx1; unsigned CommutableOpIdx2 = SrcOpIdx2; // At least one of operands to be commuted is not specified and @@ -1895,6 +1860,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); + + unsigned CommutableOpIdx1; for (CommutableOpIdx1 = LastCommutableVecOp; CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { // Just ignore and skip the k-mask operand. @@ -1946,28 +1913,43 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: - case X86::VCMPPSZ256rri: { + case X86::VCMPPSZ256rri: + case X86::VCMPPDZrrik: + case X86::VCMPPSZrrik: + case X86::VCMPPDZ128rrik: + case X86::VCMPPSZ128rrik: + case X86::VCMPPDZ256rrik: + case X86::VCMPPSZ256rrik: { + unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0; + // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests - unsigned Imm = MI.getOperand(3).getImm() & 0x7; + unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; switch (Imm) { case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED - // The indices of the commutable operands are 1 and 2. + // The indices of the commutable operands are 1 and 2 (or 2 and 3 + // when masked). // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, + 2 + OpOffset); } return false; } - case X86::MOVSDrr: case X86::MOVSSrr: - case X86::VMOVSDrr: - case X86::VMOVSSrr: + // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can + // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since + // AVX implies sse4.1. if (Subtarget.hasSSE41()) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return false; + case X86::SHUFPDrri: + // We can commute this to MOVSD. 
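The SHUFPD-to-MOVSD commute mentioned above rests on a simple lane identity: shufpd with immediate 0x02 selects {src1[0], src2[1]}, so swapping its two sources produces exactly what movsd computes. A small intrinsics check of that equivalence (values chosen for the example):

    #include <emmintrin.h>
    #include <cassert>

    int main() {
      __m128d A = _mm_set_pd(2.0, 1.0);       // A = {1.0, 2.0} (low, high)
      __m128d B = _mm_set_pd(4.0, 3.0);       // B = {3.0, 4.0}
      __m128d Shuf = _mm_shuffle_pd(B, A, 2); // {B[0], A[1]} = {3.0, 2.0}
      __m128d Movs = _mm_move_sd(A, B);       // {B[0], A[1]} = {3.0, 2.0}
      double S[2], M[2];
      _mm_storeu_pd(S, Shuf);
      _mm_storeu_pd(M, Movs);
      assert(S[0] == M[0] && S[1] == M[1]);
      return 0;
    }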
+ if (MI.getOperand(3).getImm() == 0x02) + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return false; case X86::MOVHLPSrr: case X86::UNPCKHPDrr: case X86::VMOVHLPSrr: @@ -2089,125 +2071,33 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, return false; } -X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) { - switch (BrOpc) { +X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::JE_1: return X86::COND_E; - case X86::JNE_1: return X86::COND_NE; - case X86::JL_1: return X86::COND_L; - case X86::JLE_1: return X86::COND_LE; - case X86::JG_1: return X86::COND_G; - case X86::JGE_1: return X86::COND_GE; - case X86::JB_1: return X86::COND_B; - case X86::JBE_1: return X86::COND_BE; - case X86::JA_1: return X86::COND_A; - case X86::JAE_1: return X86::COND_AE; - case X86::JS_1: return X86::COND_S; - case X86::JNS_1: return X86::COND_NS; - case X86::JP_1: return X86::COND_P; - case X86::JNP_1: return X86::COND_NP; - case X86::JO_1: return X86::COND_O; - case X86::JNO_1: return X86::COND_NO; - } -} - -/// Return condition code of a SET opcode. -X86::CondCode X86::getCondFromSETOpc(unsigned Opc) { - switch (Opc) { + case X86::JCC_1: + return static_cast<X86::CondCode>( + MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); + } +} + +/// Return condition code of a SETCC opcode. +X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::SETAr: case X86::SETAm: return X86::COND_A; - case X86::SETAEr: case X86::SETAEm: return X86::COND_AE; - case X86::SETBr: case X86::SETBm: return X86::COND_B; - case X86::SETBEr: case X86::SETBEm: return X86::COND_BE; - case X86::SETEr: case X86::SETEm: return X86::COND_E; - case X86::SETGr: case X86::SETGm: return X86::COND_G; - case X86::SETGEr: case X86::SETGEm: return X86::COND_GE; - case X86::SETLr: case X86::SETLm: return X86::COND_L; - case X86::SETLEr: case X86::SETLEm: return X86::COND_LE; - case X86::SETNEr: case X86::SETNEm: return X86::COND_NE; - case X86::SETNOr: case X86::SETNOm: return X86::COND_NO; - case X86::SETNPr: case X86::SETNPm: return X86::COND_NP; - case X86::SETNSr: case X86::SETNSm: return X86::COND_NS; - case X86::SETOr: case X86::SETOm: return X86::COND_O; - case X86::SETPr: case X86::SETPm: return X86::COND_P; - case X86::SETSr: case X86::SETSm: return X86::COND_S; + case X86::SETCCr: case X86::SETCCm: + return static_cast<X86::CondCode>( + MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } /// Return condition code of a CMov opcode. 
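With JCC_1, SETCCr/SETCCm and the unified CMOV opcodes all carrying their condition as the final immediate operand, reading or inverting a condition becomes uniform rather than a per-opcode table. A minimal sketch of that idiom (X86 target-internal headers assumed):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "X86InstrInfo.h" // X86::CondCode, X86::GetOppositeBranchCondition
    using namespace llvm;

    static X86::CondCode readCond(const MachineInstr &MI) {
      return static_cast<X86::CondCode>(
          MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
    }

    static void invertCond(MachineInstr &MI) {
      unsigned OpNo = MI.getDesc().getNumOperands() - 1;
      MI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(readCond(MI)));
    }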
-X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { - switch (Opc) { +X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm: - case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr: - return X86::COND_A; - case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm: - case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr: - return X86::COND_AE; - case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm: - case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr: - return X86::COND_B; - case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm: - case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr: - return X86::COND_BE; - case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm: - case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr: - return X86::COND_E; - case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm: - case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr: - return X86::COND_G; - case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm: - case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr: - return X86::COND_GE; - case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm: - case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr: - return X86::COND_L; - case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm: - case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr: - return X86::COND_LE; - case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm: - case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr: - return X86::COND_NE; - case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm: - case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr: - return X86::COND_NO; - case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm: - case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr: - return X86::COND_NP; - case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm: - case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr: - return X86::COND_NS; - case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm: - case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr: - return X86::COND_O; - case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm: - case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr: - return X86::COND_P; - case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm: - case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr: - return X86::COND_S; - } -} - -unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { - switch (CC) { - default: llvm_unreachable("Illegal condition code!"); - case X86::COND_E: return X86::JE_1; - case X86::COND_NE: return X86::JNE_1; - case X86::COND_L: return X86::JL_1; - case X86::COND_LE: return X86::JLE_1; - case X86::COND_G: return X86::JG_1; - case X86::COND_GE: return X86::JGE_1; - case X86::COND_B: return X86::JB_1; - case X86::COND_BE: return X86::JBE_1; - case X86::COND_A: return X86::JA_1; - case X86::COND_AE: return X86::JAE_1; - case X86::COND_S: return X86::JS_1; - case X86::COND_NS: return X86::JNS_1; - case X86::COND_P: return X86::JP_1; - case X86::COND_NP: return X86::JNP_1; - case X86::COND_O: return X86::JO_1; - case X86::COND_NO: return X86::JNO_1; + case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: + case X86::CMOV16rm: case X86::CMOV32rm: 
case X86::CMOV64rm: + return static_cast<X86::CondCode>( + MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } @@ -2293,78 +2183,18 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) { return std::make_pair(CC, NeedSwap); } -/// Return a set opcode for the given condition and -/// whether it has memory operand. -unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { - static const uint16_t Opc[16][2] = { - { X86::SETAr, X86::SETAm }, - { X86::SETAEr, X86::SETAEm }, - { X86::SETBr, X86::SETBm }, - { X86::SETBEr, X86::SETBEm }, - { X86::SETEr, X86::SETEm }, - { X86::SETGr, X86::SETGm }, - { X86::SETGEr, X86::SETGEm }, - { X86::SETLr, X86::SETLm }, - { X86::SETLEr, X86::SETLEm }, - { X86::SETNEr, X86::SETNEm }, - { X86::SETNOr, X86::SETNOm }, - { X86::SETNPr, X86::SETNPm }, - { X86::SETNSr, X86::SETNSm }, - { X86::SETOr, X86::SETOm }, - { X86::SETPr, X86::SETPm }, - { X86::SETSr, X86::SETSm } - }; - - assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); - return Opc[CC][HasMemoryOperand ? 1 : 0]; -} - -/// Return a cmov opcode for the given condition, -/// register size in bytes, and operand type. -unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, - bool HasMemoryOperand) { - static const uint16_t Opc[32][3] = { - { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, - { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, - { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, - { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, - { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, - { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, - { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, - { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, - { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, - { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, - { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, - { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, - { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, - { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, - { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, - { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, - { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, - { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, - { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, - { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, - { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, - { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, - { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, - { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, - { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, - { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, - { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, - { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, - { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, - { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, - { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, - { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } - }; +/// Return a setcc opcode based on whether it has memory operand. +unsigned X86::getSETOpc(bool HasMemoryOperand) { + return HasMemoryOperand ? X86::SETCCr : X86::SETCCm; +} - assert(CC < 16 && "Can only handle standard cond codes"); - unsigned Idx = HasMemoryOperand ? 16+CC : CC; +/// Return a cmov opcode for the given register size in bytes, and operand type. 
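A usage sketch of the helper declared above (X86 target-internal headers assumed; this mirrors how insertSelect() builds a select later in the file): the opcode now encodes only register width and register/memory form, and the condition rides along as a trailing operand:

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "X86InstrInfo.h"
    using namespace llvm;

    static void emitCMov(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, const X86InstrInfo &TII,
                         unsigned DstReg, unsigned TrueReg, unsigned FalseReg,
                         unsigned RegBytes, X86::CondCode CC) {
      unsigned Opc = X86::getCMovOpcode(RegBytes, /*HasMemoryOperand=*/false);
      BuildMI(MBB, I, DL, TII.get(Opc), DstReg)
          .addReg(FalseReg)
          .addReg(TrueReg)
          .addImm(CC);
    }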
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) { switch(RegBytes) { default: llvm_unreachable("Illegal register size!"); - case 2: return Opc[Idx][0]; - case 4: return Opc[Idx][1]; - case 8: return Opc[Idx][2]; + case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr; + case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr; + case 8: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV64rr; } } @@ -2490,7 +2320,7 @@ void X86InstrInfo::replaceBranchWithTailCall( if (!I->isBranch()) assert(0 && "Can't find the branch to replace!"); - X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode()); + X86::CondCode CC = X86::getCondFromBranch(*I); assert(BranchCond.size() == 1); if (CC != BranchCond[0].getImm()) continue; @@ -2597,13 +2427,13 @@ bool X86InstrInfo::AnalyzeBranchImpl( } // Handle conditional branches. - X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode()); + X86::CondCode BranchCode = X86::getCondFromBranch(*I); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. // In practice we should never have an undef eflags operand, if we do // abort here as we are not prepared to preserve the flag. - if (I->getOperand(1).isUndef()) + if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef()) return true; // Working from the bottom, handle the first conditional branch. @@ -2629,11 +2459,11 @@ bool X86InstrInfo::AnalyzeBranchImpl( // Which is a bit more efficient. // We conditionally jump to the fall-through block. BranchCode = GetOppositeBranchCondition(BranchCode); - unsigned JNCC = GetCondBranchFromCond(BranchCode); MachineBasicBlock::iterator OldInst = I; - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()); + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1)) + .addMBB(UnCondBrIter->getOperand(0).getMBB()) + .addImm(BranchCode); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) .addMBB(TargetBB); @@ -2798,7 +2628,7 @@ unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, if (I->isDebugInstr()) continue; if (I->getOpcode() != X86::JMP_1 && - X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + X86::getCondFromBranch(*I) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); @@ -2837,9 +2667,9 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, switch (CC) { case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. - BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE); ++Count; - BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P); ++Count; break; case X86::COND_E_AND_NP: @@ -2850,14 +2680,13 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, "body is a fall-through."); } // Synthesize COND_E_AND_NP with two branches. - BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE); ++Count; - BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP); ++Count; break; default: { - unsigned Opc = GetCondBranchFromCond(CC); - BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC); ++Count; } } @@ -2880,7 +2709,7 @@ canInsertSelect(const MachineBasicBlock &MBB, if (Cond.size() != 1) return false; // We cannot do the composite conditions, at least not in SSA form. 
- if ((X86::CondCode)Cond[0].getImm() > X86::COND_S) + if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND) return false; // Check register classes. @@ -2915,10 +2744,12 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); assert(Cond.size() == 1 && "Invalid Cond array"); - unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), - TRI.getRegSizeInBits(RC) / 8, - false /*HasMemoryOperand*/); - BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); + unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8, + false /*HasMemoryOperand*/); + BuildMI(MBB, I, DL, get(Opc), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addImm(Cond[0].getImm()); } /// Test if the given register is a physical h register. @@ -2984,22 +2815,22 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return X86::MMX_MOVD64to64rr; } - // SrcReg(FR32) -> DestReg(GR32) - // SrcReg(GR32) -> DestReg(FR32) + // SrcReg(VR128) -> DestReg(GR32) + // SrcReg(GR32) -> DestReg(VR128) if (X86::GR32RegClass.contains(DestReg) && - X86::FR32XRegClass.contains(SrcReg)) - // Copy from a FR32 register to a GR32 register. - return HasAVX512 ? X86::VMOVSS2DIZrr : - HasAVX ? X86::VMOVSS2DIrr : - X86::MOVSS2DIrr; + X86::VR128XRegClass.contains(SrcReg)) + // Copy from a VR128 register to a GR32 register. + return HasAVX512 ? X86::VMOVPDI2DIZrr : + HasAVX ? X86::VMOVPDI2DIrr : + X86::MOVPDI2DIrr; - if (X86::FR32XRegClass.contains(DestReg) && + if (X86::VR128XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) - // Copy from a GR32 register to a FR32 register. - return HasAVX512 ? X86::VMOVDI2SSZrr : - HasAVX ? X86::VMOVDI2SSrr : - X86::MOVDI2SSrr; + // Copy from a VR128 register to a VR128 register. + return HasAVX512 ? X86::VMOVDI2PDIZrr : + HasAVX ? X86::VMOVDI2PDIrr : + X86::MOVDI2PDIrr; return 0; } @@ -3129,22 +2960,38 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::MOV32rm : X86::MOV32mr; if (X86::FR32XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : - (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + (HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt) : + (HasAVX512 ? X86::VMOVSSZmr : + HasAVX ? X86::VMOVSSmr : + X86::MOVSSmr); if (X86::RFP32RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp32m : X86::ST_Fp32m; if (X86::VK32RegClass.hasSubClassEq(RC)) { assert(STI.hasBWI() && "KMOVD requires BWI"); return load ? X86::KMOVDkm : X86::KMOVDmk; } + // All of these mask pair classes have the same spill size, the same kind + // of kmov instructions can be used with all of them. + if (X86::VK1PAIRRegClass.hasSubClassEq(RC) || + X86::VK2PAIRRegClass.hasSubClassEq(RC) || + X86::VK4PAIRRegClass.hasSubClassEq(RC) || + X86::VK8PAIRRegClass.hasSubClassEq(RC) || + X86::VK16PAIRRegClass.hasSubClassEq(RC)) + return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) return load ? X86::MOV64rm : X86::MOV64mr; if (X86::FR64XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : - (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + (HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? X86::VMOVSDrm_alt : + X86::MOVSDrm_alt) : + (HasAVX512 ? 
X86::VMOVSDZmr : + HasAVX ? X86::VMOVSDmr : + X86::MOVSDmr); if (X86::VR64RegClass.hasSubClassEq(RC)) return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; if (X86::RFP64RegClass.hasSubClassEq(RC)) @@ -3219,7 +3066,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } bool X86InstrInfo::getMemOperandWithOffset( - MachineInstr &MemOp, MachineOperand *&BaseOp, int64_t &Offset, + const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { const MCInstrDesc &Desc = MemOp.getDesc(); int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); @@ -3572,25 +3419,39 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) { static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::LZCNT16rr: case X86::LZCNT16rm: - case X86::LZCNT32rr: case X86::LZCNT32rm: - case X86::LZCNT64rr: case X86::LZCNT64rm: + case X86::NEG8r: + case X86::NEG16r: + case X86::NEG32r: + case X86::NEG64r: + return X86::COND_AE; + case X86::LZCNT16rr: + case X86::LZCNT32rr: + case X86::LZCNT64rr: return X86::COND_B; - case X86::POPCNT16rr:case X86::POPCNT16rm: - case X86::POPCNT32rr:case X86::POPCNT32rm: - case X86::POPCNT64rr:case X86::POPCNT64rm: + case X86::POPCNT16rr: + case X86::POPCNT32rr: + case X86::POPCNT64rr: return X86::COND_E; - case X86::TZCNT16rr: case X86::TZCNT16rm: - case X86::TZCNT32rr: case X86::TZCNT32rm: - case X86::TZCNT64rr: case X86::TZCNT64rm: + case X86::TZCNT16rr: + case X86::TZCNT32rr: + case X86::TZCNT64rr: return X86::COND_B; - case X86::BSF16rr: case X86::BSF16rm: - case X86::BSF32rr: case X86::BSF32rm: - case X86::BSF64rr: case X86::BSF64rm: - case X86::BSR16rr: case X86::BSR16rm: - case X86::BSR32rr: case X86::BSR32rm: - case X86::BSR64rr: case X86::BSR64rm: + case X86::BSF16rr: + case X86::BSF32rr: + case X86::BSF64rr: + case X86::BSR16rr: + case X86::BSR32rr: + case X86::BSR64rr: return X86::COND_E; + case X86::BLSI32rr: + case X86::BLSI64rr: + return X86::COND_AE; + case X86::BLSR32rr: + case X86::BLSR64rr: + case X86::BLSMSK32rr: + case X86::BLSMSK64rr: + return X86::COND_B; + // TODO: TBM instructions. } } @@ -3602,7 +3463,6 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, const MachineRegisterInfo *MRI) const { // Check whether we can replace SUB with CMP. - unsigned NewOpcode = 0; switch (CmpInstr.getOpcode()) { default: break; case X86::SUB64ri32: @@ -3623,6 +3483,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; // There is no use of the destination register, we can replace SUB with CMP. + unsigned NewOpcode = 0; switch (CmpInstr.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; @@ -3746,7 +3607,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // If we are done with the basic block, we need to check whether EFLAGS is // live-out. bool IsSafe = false; - SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; + SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate; MachineBasicBlock::iterator E = CmpInstr.getParent()->end(); for (++I; I != E; ++I) { const MachineInstr &Instr = *I; @@ -3763,17 +3624,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // EFLAGS is used by this instruction. 
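The isUseDefConvertible() table a little earlier in this hunk answers the question "which condition code is true exactly when the instruction's source operand was zero", which is what lets optimizeCompareInstr drop a following test-against-zero. A tiny stand-alone model of two of its entries (illustrative only, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // NEG sets CF for every nonzero source, so "source == 0" is COND_AE (CF==0).
    static bool negSetsCF(uint64_t Src) { return Src != 0; }

    // POPCNT sets ZF exactly when the source is zero, hence COND_E.
    static bool popcntSetsZF(uint64_t Src) { return Src == 0; }

    int main() {
      assert(!negSetsCF(0) && negSetsCF(42));
      assert(popcntSetsZF(0) && !popcntSetsZF(42));
      return 0;
    }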
X86::CondCode OldCC = X86::COND_INVALID; - bool OpcIsSET = false; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) - OldCC = X86::getCondFromBranchOpc(Instr.getOpcode()); + OldCC = X86::getCondFromBranch(Instr); else { - OldCC = X86::getCondFromSETOpc(Instr.getOpcode()); - if (OldCC != X86::COND_INVALID) - OpcIsSET = true; - else - OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); + OldCC = X86::getCondFromSETCC(Instr); + if (OldCC == X86::COND_INVALID) + OldCC = X86::getCondFromCMov(Instr); } if (OldCC == X86::COND_INVALID) return false; } @@ -3818,24 +3676,10 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { - // Synthesize the new opcode. - bool HasMemoryOperand = Instr.hasOneMemOperand(); - unsigned NewOpc; - if (Instr.isBranch()) - NewOpc = GetCondBranchFromCond(ReplacementCC); - else if(OpcIsSET) - NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand); - else { - unsigned DstReg = Instr.getOperand(0).getReg(); - const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - NewOpc = getCMovFromCond(ReplacementCC, TRI->getRegSizeInBits(*DstRC)/8, - HasMemoryOperand); - } - // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. - OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); + OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC)); } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. @@ -3876,21 +3720,17 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } // Make sure Sub instruction defines EFLAGS and mark the def live. - unsigned i = 0, e = Sub->getNumOperands(); - for (; i != e; ++i) { - MachineOperand &MO = Sub->getOperand(i); - if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { - MO.setIsDead(false); - break; - } - } - assert(i != e && "Unable to locate a def EFLAGS operand"); + MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS); + assert(FlagDef && "Unable to locate a def EFLAGS operand"); + FlagDef->setIsDead(false); CmpInstr.eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (auto &Op : OpsToUpdate) - Op.first->setDesc(get(Op.second)); + for (auto &Op : OpsToUpdate) { + Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1) + .setImm(Op.second); + } return true; } @@ -4128,6 +3968,20 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB, return true; } + +static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { + MIB->setDesc(Desc); + int64_t ShiftAmt = MIB->getOperand(2).getImm(); + // Temporarily remove the immediate so we can add another source register. + MIB->RemoveOperand(2); + // Add the register. Don't copy the kill flag if there is one. + MIB.addReg(MIB->getOperand(1).getReg(), + getUndefRegState(MIB->getOperand(1).isUndef())); + // Add back the immediate. + MIB.addImm(ShiftAmt); + return true; +} + bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -4193,6 +4047,12 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MIB.addReg(SrcReg, RegState::ImplicitDefine); return true; } + if (MI.getOpcode() == X86::AVX512_256_SET0) { + // No VLX so we must reference a zmm. 
+ unsigned ZReg = + TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); + MIB->getOperand(0).setReg(ZReg); + } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } case X86::V_SETALLONES: @@ -4282,6 +4142,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::XOR64_FP: case X86::XOR32_FP: return expandXorFP(MIB, *this); + case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8)); + case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8)); + case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8)); + case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8)); + case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break; + case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break; + case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break; + case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break; + case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break; + case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break; + case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break; + case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; + case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break; + case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break; + case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break; } return false; } @@ -4303,7 +4178,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { /// FIXME: This should be turned into a TSFlags. /// static bool hasPartialRegUpdate(unsigned Opcode, - const X86Subtarget &Subtarget) { + const X86Subtarget &Subtarget, + bool ForLoadFold = false) { switch (Opcode) { case X86::CVTSI2SSrr: case X86::CVTSI2SSrm: @@ -4313,6 +4189,9 @@ static bool hasPartialRegUpdate(unsigned Opcode, case X86::CVTSI2SDrm: case X86::CVTSI642SDrr: case X86::CVTSI642SDrm: + // Load folding won't effect the undef register update since the input is + // a GPR. + return !ForLoadFold; case X86::CVTSD2SSrr: case X86::CVTSD2SSrm: case X86::CVTSS2SDrr: @@ -4389,7 +4268,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // Return true for any instruction the copies the high bits of the first source // operand into the unused high bits of the destination operand. 
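On the ADD*_DB pseudo expansions a little above: those forms are assumed here to be the pseudos used when the two inputs are known to share no set bits, and for such inputs addition never carries, so lowering them back to a plain OR preserves the value. A one-line demonstration:

    #include <cassert>
    #include <cstdint>

    int main() {
      // With no bit set in both inputs there is no carry out of any position,
      // so a + b and a | b are the same value.
      uint32_t A = 0x00F0, B = 0x000F;
      assert((A & B) == 0);        // disjoint bits
      assert((A + B) == (A | B));  // both are 0x00FF
      return 0;
    }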
-static bool hasUndefRegUpdate(unsigned Opcode) { +static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { switch (Opcode) { case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: @@ -4407,38 +4286,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTSI642SDrm: case X86::VCVTSI642SDrr_Int: case X86::VCVTSI642SDrm_Int: - case X86::VCVTSD2SSrr: - case X86::VCVTSD2SSrm: - case X86::VCVTSD2SSrr_Int: - case X86::VCVTSD2SSrm_Int: - case X86::VCVTSS2SDrr: - case X86::VCVTSS2SDrm: - case X86::VCVTSS2SDrr_Int: - case X86::VCVTSS2SDrm_Int: - case X86::VRCPSSr: - case X86::VRCPSSr_Int: - case X86::VRCPSSm: - case X86::VRCPSSm_Int: - case X86::VROUNDSDr: - case X86::VROUNDSDm: - case X86::VROUNDSDr_Int: - case X86::VROUNDSDm_Int: - case X86::VROUNDSSr: - case X86::VROUNDSSm: - case X86::VROUNDSSr_Int: - case X86::VROUNDSSm_Int: - case X86::VRSQRTSSr: - case X86::VRSQRTSSr_Int: - case X86::VRSQRTSSm: - case X86::VRSQRTSSm_Int: - case X86::VSQRTSSr: - case X86::VSQRTSSr_Int: - case X86::VSQRTSSm: - case X86::VSQRTSSm_Int: - case X86::VSQRTSDr: - case X86::VSQRTSDr_Int: - case X86::VSQRTSDm: - case X86::VSQRTSDm_Int: // AVX-512 case X86::VCVTSI2SSZrr: case X86::VCVTSI2SSZrm: @@ -4453,7 +4300,6 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTSI2SDZrr: case X86::VCVTSI2SDZrm: case X86::VCVTSI2SDZrr_Int: - case X86::VCVTSI2SDZrrb_Int: case X86::VCVTSI2SDZrm_Int: case X86::VCVTSI642SDZrr: case X86::VCVTSI642SDZrm: @@ -4479,6 +4325,42 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTUSI642SDZrr_Int: case X86::VCVTUSI642SDZrrb_Int: case X86::VCVTUSI642SDZrm_Int: + // Load folding won't effect the undef register update since the input is + // a GPR. + return !ForLoadFold; + case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: + case X86::VCVTSD2SSrr_Int: + case X86::VCVTSD2SSrm_Int: + case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: + case X86::VCVTSS2SDrr_Int: + case X86::VCVTSS2SDrm_Int: + case X86::VRCPSSr: + case X86::VRCPSSr_Int: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: + case X86::VROUNDSDr: + case X86::VROUNDSDm: + case X86::VROUNDSDr_Int: + case X86::VROUNDSDm_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSm: + case X86::VROUNDSSr_Int: + case X86::VROUNDSSm_Int: + case X86::VRSQRTSSr: + case X86::VRSQRTSSr_Int: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTSSr_Int: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDr_Int: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 case X86::VCVTSD2SSZrr: case X86::VCVTSD2SSZrr_Int: case X86::VCVTSD2SSZrrb_Int: @@ -4759,7 +4641,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size <= RCSize && 4 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = @@ -4783,7 +4665,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size <= RCSize && 8 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? 
X86::VMOVLPSZ128rm : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : @@ -4794,13 +4676,29 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( } } break; - }; + case X86::UNPCKLPDrr: + // If we won't be able to fold this to the memory form of UNPCKL, use + // MOVHPD instead. Done as custom because we can't have this in the load + // table twice. + if (OpNum == 2) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) { + MachineInstr *NewMI = + FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this); + return NewMI; + } + } + break; + } return nullptr; } -static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { - if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) || +static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, + MachineInstr &MI) { + if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; @@ -4828,15 +4726,15 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // For CPUs that favor the register form of a call or push, // do not fold loads into calls or pushes, unless optimizing for size // aggressively. - if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() && + if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() && (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || MI.getOpcode() == X86::PUSH64r)) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. - if (!MF.getFunction().optForSize() && - (hasPartialRegUpdate(MI.getOpcode(), Subtarget) || + if (!MF.getFunction().hasOptSize() && + (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; @@ -4899,6 +4797,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size < RCSize) { + // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int. // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) @@ -4937,9 +4836,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI.getDesc().getNumDefs(); - unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0; - unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); - unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); + Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register(); + Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); + Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); bool Tied1 = 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied2 = @@ -4997,14 +4896,15 @@ MachineInstr * X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, - int FrameIndex, LiveIntervals *LIS) const { + int FrameIndex, LiveIntervals *LIS, + VirtRegMap *VRM) const { // Check switch flag if (NoFusing) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. 
- if (!MF.getFunction().optForSize() && - (hasPartialRegUpdate(MI.getOpcode(), Subtarget) || + if (!MF.getFunction().hasOptSize() && + (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; @@ -5073,7 +4973,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); unsigned RegSize = TRI.getRegSizeInBits(*RC); - if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) && + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm || + Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt || + Opc == X86::VMOVSSZrm_alt) && RegSize > 32) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user @@ -5087,6 +4989,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz: + case X86::VCMPSSZrr_Intk: case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz: case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz: case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz: @@ -5124,7 +5027,9 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } } - if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) && + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm || + Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt || + Opc == X86::VMOVSDZrm_alt) && RegSize > 64) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user @@ -5138,6 +5043,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz: + case X86::VCMPSDZrr_Intk: case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz: case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz: case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz: @@ -5203,8 +5109,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (NoFusing) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. - if (!MF.getFunction().optForSize() && - (hasPartialRegUpdate(MI.getOpcode(), Subtarget) || + if (!MF.getFunction().hasOptSize() && + (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; @@ -5359,10 +5265,7 @@ extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { } else { // Clone the MMO and unset the store flag. LoadMMOs.push_back(MF.getMachineMemOperand( - MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOStore, - MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr, - MMO->getSyncScopeID(), MMO->getOrdering(), - MMO->getFailureOrdering())); + MMO, MMO->getFlags() & ~MachineMemOperand::MOStore)); } } @@ -5383,10 +5286,7 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { } else { // Clone the MMO and unset the load flag. 
StoreMMOs.push_back(MF.getMachineMemOperand( - MMO->getPointerInfo(), MMO->getFlags() & ~MachineMemOperand::MOLoad, - MMO->getSize(), MMO->getBaseAlignment(), MMO->getAAInfo(), nullptr, - MMO->getSyncScopeID(), MMO->getOrdering(), - MMO->getFailureOrdering())); + MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad)); } } @@ -5668,7 +5568,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::MOVAPSrm: @@ -5679,7 +5581,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -5694,7 +5598,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVDQUYrm: // AVX512 load instructions case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: @@ -5745,7 +5651,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: + case X86::MOVSSrm_alt: case X86::MOVSDrm: + case X86::MOVSDrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::MOVAPSrm: @@ -5756,7 +5664,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: + case X86::VMOVSSrm_alt: case X86::VMOVSDrm: + case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -5771,7 +5681,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVDQUYrm: // AVX512 load instructions case X86::VMOVSSZrm: + case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: + case X86::VMOVSDZrm_alt: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: @@ -5943,7 +5855,9 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr }, { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr }, { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm }, + { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm }, { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm }, + { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, @@ -5973,7 +5887,9 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr }, { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr }, { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm }, + { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm }, { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm }, + { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, @@ -6012,13 +5928,17 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr }, { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr }, { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm }, + { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm }, { X86::VMOVSSZrm, X86::VMOVSSZrm, 
X86::VMOVDI2PDIZrm }, + { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm }, { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r }, { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m }, { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r }, { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m }, { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr }, { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm }, + { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r }, + { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m }, { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r }, { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr }, @@ -6109,6 +6029,8 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, + { X86::VMOVDDUPrm, X86::VMOVDDUPrm, X86::VPBROADCASTQrm}, + { X86::VMOVDDUPrr, X86::VMOVDDUPrr, X86::VPBROADCASTQrr}, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, @@ -6128,6 +6050,19 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr }, }; +static const uint16_t ReplaceableInstrsFP[][3] = { + //PackedSingle PackedDouble + { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END }, + { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END }, + { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END }, + { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END }, + { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END }, +}; + static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = { //PackedSingle PackedDouble PackedInt { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, @@ -6368,7 +6303,7 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = { }; // NOTE: These should only be used by the custom domain methods. 
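[Editor's note, not part of the patch] The three-column tables above pair each opcode with its PackedSingle/PackedDouble/PackedInt equivalents; a sentinel such as X86::INSTRUCTION_LIST_END marks a domain with no equivalent, which is why the new ReplaceableInstrsFP rows only ever allow the two floating-point domains (validDomains 0x6 further down). A standalone C++ sketch of how such a row is typically consulted; the helper name and exact signature are illustrative assumptions, not code from the patch:

#include <cstddef>
#include <cstdint>

// Columns: [0] PackedSingle, [1] PackedDouble, [2] PackedInt.
// Domain is 1-based in this scheme, so the column index is Domain - 1.
const uint16_t *lookupDomainRow(unsigned Opcode, unsigned Domain,
                                const uint16_t (*Table)[3], size_t NumRows) {
  for (size_t I = 0; I != NumRows; ++I)
    if (Table[I][Domain - 1] == Opcode)
      return Table[I];   // row found: the caller picks Table[I][NewDomain - 1]
  return nullptr;        // opcode is not convertible from this domain
}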
-static const uint16_t ReplaceableCustomInstrs[][3] = { +static const uint16_t ReplaceableBlendInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi }, { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri }, @@ -6377,7 +6312,7 @@ static const uint16_t ReplaceableCustomInstrs[][3] = { { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi }, { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri }, }; -static const uint16_t ReplaceableCustomAVX2Instrs[][3] = { +static const uint16_t ReplaceableBlendAVX2Instrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi }, { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri }, @@ -6552,6 +6487,8 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const { MI.getOperand(2).getSubReg() == 0) return 0x6; return 0; + case X86::SHUFPDrri: + return 0x6; } return 0; } @@ -6571,9 +6508,9 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm); unsigned NewImm = Imm; - const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs); + const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs); if (!table) - table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs); + table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); if (Domain == 1) { // PackedSingle AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); @@ -6583,7 +6520,7 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, if (Subtarget.hasAVX2()) { // If we are already VPBLENDW use that, else use VPBLENDD. if ((ImmWidth / (Is256 ? 2 : 1)) != 8) { - table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs); + table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); } } else { @@ -6672,6 +6609,18 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, // We must always return true for MOVHLPSrr. if (Opcode == X86::MOVHLPSrr) return true; + break; + case X86::SHUFPDrri: { + if (Domain == 1) { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned NewImm = 0x44; + if (Imm & 1) NewImm |= 0x0a; + if (Imm & 2) NewImm |= 0xa0; + MI.getOperand(3).setImm(NewImm); + MI.setDesc(get(X86::SHUFPSrri)); + } + return true; + } } return false; } @@ -6691,6 +6640,8 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { validDomains = 0xe; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; + } else if (lookup(opcode, domain, ReplaceableInstrsFP)) { + validDomains = 0x6; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) { // Insert/extract instructions should only effect domain if AVX2 // is enabled. 
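[Editor's note, not part of the patch] The SHUFPDrri case added above rewrites the two-bit double-precision selector into an equivalent SHUFPSrri selector when the instruction is moved into the PackedSingle domain. A standalone sketch of that immediate remapping, with the bit meanings spelled out; the function name is illustrative:

#include <cstdint>

// SHUFPD imm: bit 0 picks the low/high double of src1 for the low result
// element, bit 1 picks the low/high double of src2 for the high element.
// The SHUFPS imm must pick the two 32-bit halves of the chosen double, so
// start from 0x44 (elements 0,1 of src1 and 0,1 of src2) and bump a pair
// to elements 2,3 whenever the corresponding SHUFPD bit is set.
uint8_t shufpdImmToShufpsImm(uint8_t Imm) {
  unsigned NewImm = 0x44;        // low double of src1, low double of src2
  if (Imm & 1) NewImm |= 0x0a;   // take src1's high double (elements 2,3)
  if (Imm & 2) NewImm |= 0xa0;   // take src2's high double (elements 2,3)
  return static_cast<uint8_t>(NewImm);
}

For example, imm 0x3 (high double from both sources) maps to 0xee, matching the value the hunk above writes back into operand 3 before switching the descriptor to SHUFPSrri.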
@@ -6730,6 +6681,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { "256-bit vector operations only available in AVX2"); table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); } + if (!table) { // try the FP table + table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP); + assert((!table || Domain < 3) && + "Can only select PackedSingle or PackedDouble"); + } if (!table) { // try the other table assert(Subtarget.hasAVX2() && "256-bit insert/extract only available in AVX2"); @@ -7140,6 +7096,20 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::PADDWrr: case X86::PADDDrr: case X86::PADDQrr: + case X86::PMULLWrr: + case X86::PMULLDrr: + case X86::PMAXSBrr: + case X86::PMAXSDrr: + case X86::PMAXSWrr: + case X86::PMAXUBrr: + case X86::PMAXUDrr: + case X86::PMAXUWrr: + case X86::PMINSBrr: + case X86::PMINSDrr: + case X86::PMINSWrr: + case X86::PMINUBrr: + case X86::PMINUDrr: + case X86::PMINUWrr: case X86::VPANDrr: case X86::VPANDYrr: case X86::VPANDDZ128rr: @@ -7243,6 +7213,78 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VPMULLQZ128rr: case X86::VPMULLQZ256rr: case X86::VPMULLQZrr: + case X86::VPMAXSBrr: + case X86::VPMAXSBYrr: + case X86::VPMAXSBZ128rr: + case X86::VPMAXSBZ256rr: + case X86::VPMAXSBZrr: + case X86::VPMAXSDrr: + case X86::VPMAXSDYrr: + case X86::VPMAXSDZ128rr: + case X86::VPMAXSDZ256rr: + case X86::VPMAXSDZrr: + case X86::VPMAXSQZ128rr: + case X86::VPMAXSQZ256rr: + case X86::VPMAXSQZrr: + case X86::VPMAXSWrr: + case X86::VPMAXSWYrr: + case X86::VPMAXSWZ128rr: + case X86::VPMAXSWZ256rr: + case X86::VPMAXSWZrr: + case X86::VPMAXUBrr: + case X86::VPMAXUBYrr: + case X86::VPMAXUBZ128rr: + case X86::VPMAXUBZ256rr: + case X86::VPMAXUBZrr: + case X86::VPMAXUDrr: + case X86::VPMAXUDYrr: + case X86::VPMAXUDZ128rr: + case X86::VPMAXUDZ256rr: + case X86::VPMAXUDZrr: + case X86::VPMAXUQZ128rr: + case X86::VPMAXUQZ256rr: + case X86::VPMAXUQZrr: + case X86::VPMAXUWrr: + case X86::VPMAXUWYrr: + case X86::VPMAXUWZ128rr: + case X86::VPMAXUWZ256rr: + case X86::VPMAXUWZrr: + case X86::VPMINSBrr: + case X86::VPMINSBYrr: + case X86::VPMINSBZ128rr: + case X86::VPMINSBZ256rr: + case X86::VPMINSBZrr: + case X86::VPMINSDrr: + case X86::VPMINSDYrr: + case X86::VPMINSDZ128rr: + case X86::VPMINSDZ256rr: + case X86::VPMINSDZrr: + case X86::VPMINSQZ128rr: + case X86::VPMINSQZ256rr: + case X86::VPMINSQZrr: + case X86::VPMINSWrr: + case X86::VPMINSWYrr: + case X86::VPMINSWZ128rr: + case X86::VPMINSWZ256rr: + case X86::VPMINSWZrr: + case X86::VPMINUBrr: + case X86::VPMINUBYrr: + case X86::VPMINUBZ128rr: + case X86::VPMINUBZ256rr: + case X86::VPMINUBZrr: + case X86::VPMINUDrr: + case X86::VPMINUDYrr: + case X86::VPMINUDZ128rr: + case X86::VPMINUDZ256rr: + case X86::VPMINUDZrr: + case X86::VPMINUQZ128rr: + case X86::VPMINUQZ256rr: + case X86::VPMINUQZrr: + case X86::VPMINUWrr: + case X86::VPMINUWYrr: + case X86::VPMINUWZ128rr: + case X86::VPMINUWZ256rr: + case X86::VPMINUWZrr: // Normal min/max instructions are not commutative because of NaN and signed // zero semantics, but these are. Thus, there's no need to check for global // relaxed math; the instructions themselves have the properties we need. @@ -7698,7 +7740,7 @@ bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF, // Does the function use a red zone? If it does, then we can't risk messing // with the stack. 
- if (!F.hasFnAttribute(Attribute::NoRedZone)) { + if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) { // It could have a red zone. If it does, then we don't want to touch it. const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); if (!X86FI || X86FI->getUsesRedZone()) diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 159cb50afc5c..13ca17139494 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -1,9 +1,8 @@ //===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -36,62 +35,24 @@ enum AsmComments { AC_EVEX_2_VEX = MachineInstr::TAsmComments }; -// X86 specific condition code. These correspond to X86_*_COND in -// X86InstrInfo.td. They must be kept in synch. -enum CondCode { - COND_A = 0, - COND_AE = 1, - COND_B = 2, - COND_BE = 3, - COND_E = 4, - COND_G = 5, - COND_GE = 6, - COND_L = 7, - COND_LE = 8, - COND_NE = 9, - COND_NO = 10, - COND_NP = 11, - COND_NS = 12, - COND_O = 13, - COND_P = 14, - COND_S = 15, - LAST_VALID_COND = COND_S, - - // Artificial condition codes. These are used by AnalyzeBranch - // to indicate a block terminated with two conditional branches that together - // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, - // which can't be represented on x86 with a single condition. These - // are never used in MachineInstrs and are inverses of one another. - COND_NE_OR_P, - COND_E_AND_NP, - - COND_INVALID -}; - -// Turn condition code into conditional branch opcode. -unsigned GetCondBranchFromCond(CondCode CC); - /// Return a pair of condition code for the given predicate and whether /// the instruction operands should be swaped to match the condition code. std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate); -/// Return a set opcode for the given condition and whether it has -/// a memory operand. -unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); +/// Return a setcc opcode based on whether it has a memory operand. +unsigned getSETOpc(bool HasMemoryOperand = false); -/// Return a cmov opcode for the given condition, register size in -/// bytes, and operand type. -unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, - bool HasMemoryOperand = false); +/// Return a cmov opcode for the given register size in bytes, and operand type. +unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false); -// Turn jCC opcode into condition code. -CondCode getCondFromBranchOpc(unsigned Opc); +// Turn jCC instruction into condition code. +CondCode getCondFromBranch(const MachineInstr &MI); -// Turn setCC opcode into condition code. -CondCode getCondFromSETOpc(unsigned Opc); +// Turn setCC instruction into condition code. +CondCode getCondFromSETCC(const MachineInstr &MI); -// Turn CMov opcode into condition code. -CondCode getCondFromCMovOpc(unsigned Opc); +// Turn CMov instruction into condition code. 
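[Editor's note, not part of the patch] With condition codes now carried as the instruction's last immediate operand instead of being baked into per-condition opcodes, callers of the helpers declared in this header read the code off the MachineInstr and rewrite it in place, much like the OpsToUpdate change in X86InstrInfo.cpp above. A hedged usage sketch, assuming the usual LLVM MachineInstr headers; the wrapper function name is hypothetical:

// Invert the condition of a CMOV-style instruction in place.
void invertCMovCondition(llvm::MachineInstr &MI) {
  using namespace llvm;
  X86::CondCode CC = X86::getCondFromCMov(MI);
  if (CC == X86::COND_INVALID)
    return;                                    // not a recognized CMOV
  // The condition code is the last explicit operand of the instruction.
  MI.getOperand(MI.getDesc().getNumOperands() - 1)
      .setImm(X86::GetOppositeBranchCondition(CC));
}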
+CondCode getCondFromCMov(const MachineInstr &MI); /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. @@ -327,7 +288,8 @@ public: SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + bool getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const override; bool analyzeBranchPredicate(MachineBasicBlock &MBB, @@ -388,7 +350,8 @@ public: foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, - LiveIntervals *LIS = nullptr) const override; + LiveIntervals *LIS = nullptr, + VirtRegMap *VRM = nullptr) const override; /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific @@ -453,7 +416,10 @@ public: /// conservative. If it cannot definitely determine the safety after visiting /// a few instructions in each direction it assumes it's not safe. bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock::iterator I) const { + return MBB.computeRegisterLiveness(&RI, X86::EFLAGS, I, 4) == + MachineBasicBlock::LQR_Dead; + } /// True if MI has a condition code def, e.g. EFLAGS, that is /// not marked dead. @@ -590,7 +556,8 @@ private: MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, - LiveVariables *LV) const; + LiveVariables *LV, + bool Is8BitOp) const; /// Handles memory folding for special case instructions, for instance those /// requiring custom manipulation of the address. diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 4ec4d566ca99..8e05dd8ec5c1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -1,9 +1,8 @@ //===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,6 +63,10 @@ def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86rdpkru : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; +def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -124,6 +127,9 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; +def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>; + def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, [SDNPHasChain,SDNPSideEffect]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, @@ -152,6 +158,11 @@ def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, [SDNPHasChain, SDNPSideEffect]>; +def X86rdpkru : SDNode<"X86ISD::RDPKRU", SDTX86rdpkru, + [SDNPHasChain, SDNPSideEffect]>; +def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru, + [SDNPHasChain, SDNPSideEffect]>; + def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -206,13 +217,6 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; -def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; - def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -306,6 +310,11 @@ def X86tpause : SDNode<"X86ISD::TPAUSE", SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, [SDNPHasChain, SDNPSideEffect]>; +def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD, + [SDNPHasChain, SDNPSideEffect]>; +def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD, + [SDNPHasChain, SDNPSideEffect]>; + //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -371,37 +380,35 @@ def anymem : X86MemOperand<"printanymem">; // restrict to only unsized memory. 
def opaquemem : X86MemOperand<"printopaquemem">; -def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; -def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; -def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; -def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; -def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; -def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; -def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; -def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; -def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; -def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; -def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; -def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; -def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; - -def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; +def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; +def i32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; // Gather mem operands -def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>; -def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>; -def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>; -def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>; -def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>; - -def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>; -def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>; -def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>; -def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>; -def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>; -def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>; -def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>; -def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>; +def vx64mem : X86VMemOperand<VR128, "printqwordmem", X86Mem64_RC128Operand>; +def vx128mem : X86VMemOperand<VR128, "printxmmwordmem", X86Mem128_RC128Operand>; +def vx256mem : X86VMemOperand<VR128, "printymmwordmem", X86Mem256_RC128Operand>; +def vy128mem : X86VMemOperand<VR256, "printxmmwordmem", X86Mem128_RC256Operand>; +def vy256mem : X86VMemOperand<VR256, "printymmwordmem", X86Mem256_RC256Operand>; + +def vx64xmem : X86VMemOperand<VR128X, "printqwordmem", X86Mem64_RC128XOperand>; +def vx128xmem : X86VMemOperand<VR128X, "printxmmwordmem", X86Mem128_RC128XOperand>; +def vx256xmem : X86VMemOperand<VR128X, "printymmwordmem", 
X86Mem256_RC128XOperand>; +def vy128xmem : X86VMemOperand<VR256X, "printxmmwordmem", X86Mem128_RC256XOperand>; +def vy256xmem : X86VMemOperand<VR256X, "printymmwordmem", X86Mem256_RC256XOperand>; +def vy512xmem : X86VMemOperand<VR256X, "printzmmwordmem", X86Mem512_RC256XOperand>; +def vz256mem : X86VMemOperand<VR512, "printymmwordmem", X86Mem256_RC512Operand>; +def vz512mem : X86VMemOperand<VR512, "printzmmwordmem", X86Mem512_RC512Operand>; // A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead // of a plain GPR, so that it doesn't potentially require a REX prefix. @@ -409,7 +416,7 @@ def ptr_rc_norex : PointerLikeRegClass<2>; def ptr_rc_norex_nosp : PointerLikeRegClass<3>; def i8mem_NOREX : Operand<iPTR> { - let PrintMethod = "printi8mem"; + let PrintMethod = "printbytemem"; let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem8AsmOperand; @@ -424,7 +431,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<4>; // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. def i32mem_TC : Operand<i32> { - let PrintMethod = "printi32mem"; + let PrintMethod = "printdwordmem"; let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem32AsmOperand; @@ -435,7 +442,7 @@ def i32mem_TC : Operand<i32> { // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. def i64mem_TC : Operand<i64> { - let PrintMethod = "printi64mem"; + let PrintMethod = "printqwordmem"; let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem64AsmOperand; @@ -603,24 +610,10 @@ def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", X86MemOffs64_64AsmOperand>; -def SSECC : Operand<i8> { - let PrintMethod = "printSSEAVXCC"; - let OperandType = "OPERAND_IMMEDIATE"; -} - -def AVXCC : Operand<i8> { - let PrintMethod = "printSSEAVXCC"; - let OperandType = "OPERAND_IMMEDIATE"; -} - -def AVX512ICC : Operand<i8> { - let PrintMethod = "printSSEAVXCC"; - let OperandType = "OPERAND_IMMEDIATE"; -} - -def XOPCC : Operand<i8> { - let PrintMethod = "printXOPCC"; - let OperandType = "OPERAND_IMMEDIATE"; +def ccode : Operand<i8> { + let PrintMethod = "printCondCode"; + let OperandNamespace = "X86"; + let OperandType = "OPERAND_COND_CODE"; } class ImmSExtAsmOperandClass : AsmOperandClass { @@ -640,7 +633,8 @@ def AVX512RCOperand : AsmOperandClass { } def AVX512RC : Operand<i32> { let PrintMethod = "printRoundingControl"; - let OperandType = "OPERAND_IMMEDIATE"; + let OperandNamespace = "X86"; + let OperandType = "OPERAND_ROUNDING_CONTROL"; let ParserMatchClass = AVX512RCOperand; } @@ -718,6 +712,14 @@ def u8imm : Operand<i8> { let OperandType = "OPERAND_IMMEDIATE"; } +// 16-bit immediate but only 8-bits are significant and they are unsigned. +// Used by BT instructions. +def i16u8imm : Operand<i16> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // 32-bit immediate but only 8-bits are significant and they are unsigned. // Used by some SSE/AVX instructions that use intrinsics. def i32u8imm : Operand<i32> { @@ -726,6 +728,14 @@ def i32u8imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } +// 64-bit immediate but only 8-bits are significant and they are unsigned. 
+// Used by BT instructions. +def i64u8imm : Operand<i64> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // 64-bits but only 32 bits are significant, and those bits are treated as being // pc relative. def i64i32imm_pcrel : Operand<i64> { @@ -747,6 +757,33 @@ def lea64mem : Operand<i64> { let ParserMatchClass = X86MemAsmOperand; } +let RenderMethod = "addMaskPairOperands" in { + def VK1PairAsmOperand : AsmOperandClass { let Name = "VK1Pair"; } + def VK2PairAsmOperand : AsmOperandClass { let Name = "VK2Pair"; } + def VK4PairAsmOperand : AsmOperandClass { let Name = "VK4Pair"; } + def VK8PairAsmOperand : AsmOperandClass { let Name = "VK8Pair"; } + def VK16PairAsmOperand : AsmOperandClass { let Name = "VK16Pair"; } +} + +def VK1Pair : RegisterOperand<VK1PAIR, "printVKPair"> { + let ParserMatchClass = VK1PairAsmOperand; +} + +def VK2Pair : RegisterOperand<VK2PAIR, "printVKPair"> { + let ParserMatchClass = VK2PairAsmOperand; +} + +def VK4Pair : RegisterOperand<VK4PAIR, "printVKPair"> { + let ParserMatchClass = VK4PairAsmOperand; +} + +def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> { + let ParserMatchClass = VK8PairAsmOperand; +} + +def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> { + let ParserMatchClass = VK16PairAsmOperand; +} //===----------------------------------------------------------------------===// // X86 Complex Pattern Definitions. @@ -833,6 +870,8 @@ def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def PKU : Predicate<"Subtarget->hasPKU()">; def HasVNNI : Predicate<"Subtarget->hasVNNI()">; +def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; +def HasBF16 : Predicate<"Subtarget->hasBF16()">; def HasBITALG : Predicate<"Subtarget->hasBITALG()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; @@ -894,8 +933,10 @@ def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">; def HasRDPID : Predicate<"Subtarget->hasRDPID()">; def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">; def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">; +def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; +def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, @@ -928,12 +969,12 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; // the Function object through the <Target>Subtarget and objections were raised // to that (see post-commit review comments for r301750). 
let RecomputePerFunction = 1 in { - def OptForSize : Predicate<"MF->getFunction().optForSize()">; - def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">; - def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">; + def OptForSize : Predicate<"MF->getFunction().hasOptSize()">; + def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">; + def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">; def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " - "MF->getFunction().optForSize()">; - def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || " + "MF->getFunction().hasOptSize()">; + def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || " "!Subtarget->hasSSE41()">; } @@ -959,22 +1000,22 @@ include "X86InstrFormats.td" // X86 specific condition code. These correspond to CondCode in // X86InstrInfo.h. They must be kept in synch. -def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE -def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC +def X86_COND_O : PatLeaf<(i8 0)>; +def X86_COND_NO : PatLeaf<(i8 1)>; def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C -def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA +def X86_COND_AE : PatLeaf<(i8 3)>; // alt. COND_NC def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z -def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE -def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL -def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE -def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG -def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ -def X86_COND_NO : PatLeaf<(i8 10)>; +def X86_COND_NE : PatLeaf<(i8 5)>; // alt. COND_NZ +def X86_COND_BE : PatLeaf<(i8 6)>; // alt. COND_NA +def X86_COND_A : PatLeaf<(i8 7)>; // alt. COND_NBE +def X86_COND_S : PatLeaf<(i8 8)>; +def X86_COND_NS : PatLeaf<(i8 9)>; +def X86_COND_P : PatLeaf<(i8 10)>; // alt. COND_PE def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO -def X86_COND_NS : PatLeaf<(i8 12)>; -def X86_COND_O : PatLeaf<(i8 13)>; -def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE -def X86_COND_S : PatLeaf<(i8 15)>; +def X86_COND_L : PatLeaf<(i8 12)>; // alt. COND_NGE +def X86_COND_GE : PatLeaf<(i8 13)>; // alt. COND_NL +def X86_COND_LE : PatLeaf<(i8 14)>; // alt. COND_NG +def X86_COND_G : PatLeaf<(i8 15)>; // alt. COND_NLE def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; @@ -1007,16 +1048,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ // Eventually, it would be nice to allow ConstantHoisting to merge constants // globally for potentially added savings. 
// -def imm8_su : PatLeaf<(i8 relocImm), [{ +def relocImm8_su : PatLeaf<(i8 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def imm16_su : PatLeaf<(i16 relocImm), [{ +def relocImm16_su : PatLeaf<(i16 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def imm32_su : PatLeaf<(i32 relocImm), [{ - return !shouldAvoidImmediateInstFormsForSize(N); -}]>; -def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ +def relocImm32_su : PatLeaf<(i32 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; @@ -1121,7 +1159,19 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; -def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + +// We can treat an i8/i16 extending load to i64 as a 32 bit load if its known +// to be 4 byte aligned or better. +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType != ISD::EXTLOAD) + return false; + if (LD->getMemoryVT() == MVT::i32) + return true; + + return LD->getAlignment() >= 4 && !LD->isVolatile(); +}]>; // An 'and' node with a single use. @@ -1517,16 +1567,16 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm8_su:$src), addr:$dst)]>; + [(store (i8 relocImm8_su:$src), addr:$dst)]>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16; + [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32; + [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64immSExt32_su:$src, addr:$dst)]>, + [(store i64relocImmSExt32_su:$src, addr:$dst)]>, Requires<[In64BitMode]>; } // SchedRW @@ -1773,36 +1823,36 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteBitTestRegLd] in { } let SchedRW = [WriteBitTest] in { -def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16u8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, + [(set EFLAGS, (X86bt GR16:$src1, imm:$src2))]>, OpSize16, TB; -def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32u8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, + [(set EFLAGS, (X86bt GR32:$src1, imm:$src2))]>, OpSize32, TB; -def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64u8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; + [(set EFLAGS, (X86bt GR64:$src1, imm:$src2))]>, TB; } // SchedRW // Note that these instructions aren't 
slow because that only applies when the // other operand is in a register. When it's an immediate, bt is still fast. let SchedRW = [WriteBitTestImmLd] in { -def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi16 addr:$src1), - i16immSExt8:$src2))]>, + imm:$src2))]>, OpSize16, TB; -def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi32 addr:$src1), - i32immSExt8:$src2))]>, + imm:$src2))]>, OpSize32, TB; -def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), - i64immSExt8:$src2))]>, TB, + imm:$src2))]>, TB, Requires<[In64BitMode]>; } // SchedRW @@ -1832,20 +1882,20 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), } let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in { -def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in { -def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, Requires<[In64BitMode]>; } @@ -1875,24 +1925,24 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), } let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in { -def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = 
[WriteBitTestSetImmRMW] in { -def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, Requires<[In64BitMode]>; } @@ -1922,20 +1972,20 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), } let SchedRW = [WriteBitTestSet], Constraints = "$src1 = $dst" in { -def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16u8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32u8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64u8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteBitTestSetImmRMW] in { -def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16u8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; -def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32u8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; -def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64u8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, Requires<[In64BitMode]>; } @@ -2090,12 +2140,13 @@ def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), - "cmpxchg8b\t$dst", []>, TB; + "cmpxchg8b\t$dst", []>, TB, Requires<[HasCmpxchg8b]>; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +// NOTE: In64BitMode check needed for the AssemblerPredicate. def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", []>, - TB, Requires<[HasCmpxchg16b, In64BitMode]>; + TB, Requires<[HasCmpxchg16b,In64BitMode]>; } // SchedRW, mayLoad, mayStore, hasSideEffects @@ -2388,6 +2439,11 @@ def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs), return hasNoCarryFlagUses(SDValue(N, 1)); }]>; +def and_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs), + (X86and_flag node:$lhs, node:$rhs), [{ + return hasNoCarryFlagUses(SDValue(N, 1)); +}]>; + let Predicates = [HasBMI] in { // FIXME: patterns for the load versions are not implemented def : Pat<(and GR32:$src, (add GR32:$src, -1)), @@ -2406,12 +2462,20 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; // Versions to match flag producing ops. - // X86and_flag nodes are rarely created. Those should use CMP+AND. 
We do - // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed. + def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, -1)), + (BLSR32rr GR32:$src)>; + def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, -1)), + (BLSR64rr GR64:$src)>; + def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)), (BLSMSK32rr GR32:$src)>; def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)), (BLSMSK64rr GR64:$src)>; + + def : Pat<(and_flag_nocf GR32:$src, (ineg GR32:$src)), + (BLSI32rr GR32:$src)>; + def : Pat<(and_flag_nocf GR64:$src, (ineg GR64:$src)), + (BLSI64rr GR64:$src)>; } multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC, @@ -2653,16 +2717,12 @@ defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W; // MONITORX/MWAITX Instructions // let SchedRW = [ WriteSystem ] in { - let usesCustomInserter = 1 in { - def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), - [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>, - Requires<[ HasMWAITX ]>; - } - - let Uses = [ EAX, ECX, EDX ] in { - def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, - TB, Requires<[ HasMWAITX ]>; - } + let Uses = [ EAX, ECX, EDX ] in + def MONITORX32rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, + TB, Requires<[ HasMWAITX, Not64BitMode ]>; + let Uses = [ RAX, ECX, EDX ] in + def MONITORX64rrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, + TB, Requires<[ HasMWAITX, In64BitMode ]>; let Uses = [ ECX, EAX, EBX ] in { def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", @@ -2676,9 +2736,9 @@ def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>, def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>, Requires<[ In64BitMode ]>; -def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORX32rrr)>, Requires<[ Not64BitMode ]>; -def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORX64rrr)>, Requires<[ In64BitMode ]>; //===----------------------------------------------------------------------===// @@ -2738,21 +2798,50 @@ def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), } // SchedRW //===----------------------------------------------------------------------===// +// ENQCMD/S - Enqueue 64-byte command as user with 64-byte write atomicity +// +let SchedRW = [WriteStore], Defs = [EFLAGS] in { + def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), + "enqcmd\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>, + T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; + def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), + "enqcmd\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>, + T8XD, AdSize32, Requires<[HasENQCMD]>; + def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), + "enqcmd\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>, + T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>; + + def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), + "enqcmds\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>, + T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; + def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), + "enqcmds\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmds GR32:$dst, 
addr:$src))]>, + T8XS, AdSize32, Requires<[HasENQCMD]>; + def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), + "enqcmds\t{$src, $dst|$dst, $src}", + [(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>, + T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>; +} + +//===----------------------------------------------------------------------===// // CLZERO Instruction // let SchedRW = [WriteSystem] in { let Uses = [EAX] in - def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, - TB, Requires<[HasCLZERO]>; - - let usesCustomInserter = 1 in { - def CLZERO : PseudoI<(outs), (ins i32mem:$src1), - [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>; - } + def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, + TB, Requires<[HasCLZERO, Not64BitMode]>; + let Uses = [RAX] in + def CLZERO64r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, + TB, Requires<[HasCLZERO, In64BitMode]>; } // SchedRW -def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>; -def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; +def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>; +def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. @@ -2812,8 +2901,6 @@ let Predicates = [HasTBM] in { (TZMSK64rr GR64:$src)>; // Patterns to match flag producing ops. - // X86and_flag nodes are rarely created. Those should use CMP+AND. We do - // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed. def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))), (BLCI32rr GR32:$src)>; def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))), @@ -2825,6 +2912,11 @@ let Predicates = [HasTBM] in { def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)), (BLCI64rr GR64:$src)>; + def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)), (BLCMSK32rr GR32:$src)>; def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)), @@ -2849,6 +2941,11 @@ let Predicates = [HasTBM] in { (T1MSKC32rr GR32:$src)>; def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)), (T1MSKC64rr GR64:$src)>; + + def : Pat<(and_flag_nocf (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and_flag_nocf (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; } // HasTBM //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index 8f3357170576..57835b1a256a 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -1,9 +1,8 @@ //===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -153,7 +152,9 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, // MMX EMMS Instruction //===----------------------------------------------------------------------===// -let SchedRW = [WriteEMMS] in +let SchedRW = [WriteEMMS], + Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>; //===----------------------------------------------------------------------===// @@ -544,7 +545,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td index c1a8cc7c5fbf..f7d931510fe2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -1,9 +1,8 @@ //===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td index 488cc4438076..747f5aa86653 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSGX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td @@ -1,9 +1,8 @@ //===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index ddfc369b1180..7d0a5b87baf4 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -1,9 +1,8 @@ //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,6 +21,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, Domain d, X86FoldableSchedWrite sched, bit Is2Addr = 1> { +let isCodeGenOnly = 1 in { let isCommutable = 1 in { def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, @@ -37,6 +37,7 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, @@ -44,7 +45,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, ValueType VT, string asm, Operand memopr, ComplexPattern mem_cpat, Domain d, X86FoldableSchedWrite sched, bit Is2Addr = 1> { -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let hasSideEffects = 0 in { def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), @@ -224,16 +225,29 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, } // Loading from memory automatically zeroing upper bits. -multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_pat, string OpcodeStr, Domain d> { - def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), +multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, + PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr, + Domain d> { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], d>, + [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; - def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], d>, + [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, Sched<[WriteFLoad]>; + + // _alt version uses FR32/FR64 register class. + let isCodeGenOnly = 1 in { + def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], d>, + VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; + def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], d>, + Sched<[WriteFLoad]>; + } } defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", @@ -242,49 +256,25 @@ defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", SSEPackedDouble, "MOVSD", UseSSE2>, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { - defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss", + defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss", SSEPackedSingle>, XS; - defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd", + defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd", SSEPackedDouble>, XD; } // Patterns let Predicates = [UseAVX] in { - // MOVSSrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; - - // MOVSDrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (VMOVSSrm addr:$src)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (VMOVSDrm addr:$src)>; // Represent the same patterns above but in the form they appear for // 256-bit types - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; - def : Pat<(v8f32 (X86vzload addr:$src)), + def : Pat<(v8f32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + def : Pat<(v4f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; - def : Pat<(v4f64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; - - // Extract and store. - def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), - addr:$dst), - (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; } let Predicates = [UseAVX, OptForSize] in { @@ -304,59 +294,24 @@ let Predicates = [UseAVX, OptForSize] in { (SUBREG_TO_REG (i32 0), (v4i32 (VMOVSSrr (v4i32 (V_SET0)), (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>; - - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), - sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VMOVSDrr (v2i64 (V_SET0)), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), - sub_xmm)>; } -let Predicates = [UseSSE1] in { - let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVSS to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; - } - - // MOVSSrm already zeros the high parts of the register. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; - def : Pat<(v4f32 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; - - // Extract and store. 
- def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), - addr:$dst), - (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; +let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in { +// Move scalar to XMM zero-extended, zeroing a VR128 then do a +// MOVSS to the lower bits. +def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; +def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; } -let Predicates = [UseSSE2] in { - // MOVSDrm already zeros the high parts of the register. - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzload addr:$src)), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; -} - -// Aliases to help the assembler pick two byte VEX encodings by swapping the -// operands relative to the normal instructions to use VEX.R instead of VEX.B. -def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; -def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; +let Predicates = [UseSSE2] in +def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (MOVSDrm addr:$src)>; + +let Predicates = [UseSSE1] in +def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (MOVSSrm addr:$src)>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions @@ -504,25 +459,6 @@ let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { } // SchedRW } // Predicate -// Aliases to help the assembler pick two byte VEX encodings by swapping the -// operands relative to the normal instructions to use VEX.R instead of VEX.B. -def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}", - (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}", - (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}", - (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", - (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}", - (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}", - (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}", - (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", - (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>; - // Reversed version with ".s" suffix for GAS compatibility. 
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}", (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>; @@ -700,10 +636,10 @@ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">; let SchedRW = [WriteFStore] in { let Predicates = [UseAVX] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>, + []>, VEX, VEX_WIG; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", @@ -711,10 +647,10 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; }// UseAVX +let mayStore = 1, hasSideEffects = 0 in def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>; + []>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 VR128:$src), @@ -722,16 +658,19 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), } // SchedRW let Predicates = [UseSSE1] in { - // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), - (iPTR 0))), addr:$src1), - (MOVLPSmr addr:$src1, VR128:$src2)>; - // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)), + def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1, + (i8 -28)), (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)), + (MOVLPSrm VR128:$src1, addr:$src2)>; + + def : Pat<(v4f32 (X86vzload64 addr:$src)), + (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>; + def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst), + (MOVLPSmr addr:$dst, VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -744,24 +683,20 @@ let SchedRW = [WriteFStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
let Predicates = [UseAVX] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), - (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; + []>, VEX, VEX_WIG; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; } // UseAVX +let mayStore = 1, hasSideEffects = 0 in def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), - (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)]>; + []>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt @@ -775,19 +710,31 @@ let Predicates = [UseAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; + + // MOVLPD patterns + def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE1] in { // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)), + def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))), (MOVHPSrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)), + addr:$dst), + (MOVHPSmr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE2] in { @@ -798,11 +745,24 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), + (MOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; + + // MOVLPD patterns + def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in { + // Use MOVLPD to load into the low bits from a full vector unless we can use + // BLENDPD. 
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; } //===----------------------------------------------------------------------===// @@ -847,13 +807,16 @@ let Constraints = "$src1 = $dst" in { multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm, X86FoldableSchedWrite sched> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (OpNode SrcRC:$src))]>, - Sched<[sched]>; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, - Sched<[sched.Folded]>; + string asm, string mem, X86FoldableSchedWrite sched, + SchedRead Int2Fpu = ReadDefault> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (OpNode SrcRC:$src))]>, + Sched<[sched, Int2Fpu]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + mem#"\t{$src, $dst|$dst, $src}", + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, + Sched<[sched.Folded]>; } multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop, @@ -872,74 +835,55 @@ let hasSideEffects = 0 in { } multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm, + X86MemOperand x86memop, string asm, string mem, X86FoldableSchedWrite sched> { let hasSideEffects = 0, Predicates = [UseAVX] in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - Sched<[sched]>; + Sched<[sched, ReadDefault, ReadInt2Fpu]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // hasSideEffects = 0 } -let Predicates = [UseAVX] in { +let isCodeGenOnly = 1, Predicates = [UseAVX] in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}", + "cvttss2si", "cvttss2si", WriteCvtSS2I>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}", + "cvttss2si", "cvttss2si", WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}", + "cvttsd2si", "cvttsd2si", WriteCvtSD2I>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}", + "cvttsd2si", "cvttsd2si", WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; - -def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">; -def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">; -def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">; -def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">; -def : 
InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">; -def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", - (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">; } + // The assembler can recognize rr 64-bit instructions by seeing a rxx // register, but the same isn't true when only using memory operands, // provide other assembly "l" and "q" forms to address this explicitly // where appropriate to do so. -defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}", +let isCodeGenOnly = 1 in { +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS, VEX_4V, VEX_LIG; -defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}", +defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG; -defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}", +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD, VEX_4V, VEX_LIG; -defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}", +defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG; +} // isCodeGenOnly = 1 let Predicates = [UseAVX] in { - def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">; - def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">; - def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), @@ -959,52 +903,32 @@ let Predicates = [UseAVX] in { (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>; } +let isCodeGenOnly = 1 in { defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}", + "cvttss2si", "cvttss2si", WriteCvtSS2I>, XS; defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}", + "cvttss2si", "cvttss2si", WriteCvtSS2I>, XS, REX_W; defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}", + "cvttsd2si", "cvttsd2si", WriteCvtSD2I>, XD; defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}", + "cvttsd2si", "cvttsd2si", WriteCvtSD2I>, XD, REX_W; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, - "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", - WriteCvtI2SS>, XS; + "cvtsi2ss", "cvtsi2ss{l}", + WriteCvtI2SS, ReadInt2Fpu>, XS; defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, - "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", - WriteCvtI2SS>, XS, REX_W; + "cvtsi2ss", "cvtsi2ss{q}", + WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, - "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", - WriteCvtI2SD>, XD; + "cvtsi2sd", "cvtsi2sd{l}", + WriteCvtI2SD, ReadInt2Fpu>, XD; defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, - "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", - WriteCvtI2SD>, XD, REX_W; - -def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", - (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">; -def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", - (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">; -def : InstAlias<"cvttsd2si{l}\t{$src, 
$dst|$dst, $src}", - (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">; -def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", - (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">; -def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", - (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">; -def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", - (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">; -def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">; -def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">; - -def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", - (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">; -def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", - (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">; + "cvtsi2sd", "cvtsi2sd{q}", + WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W; +} // isCodeGenOnly = 1 // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). @@ -1025,20 +949,20 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, - string asm, X86FoldableSchedWrite sched, + string asm, string mem, X86FoldableSchedWrite sched, bit Is2Addr = 1> { let hasSideEffects = 0 in { def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - []>, Sched<[sched]>; + []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>; let mayLoad = 1 in def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src2), !if(Is2Addr, - !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}", + asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1057,48 +981,73 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W; -let isCodeGenOnly = 1 in { - let Predicates = [UseAVX] in { - defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V; - defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W; - defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V; - defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W; - } - let Constraints = "$src1 = $dst" in { - defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS; - defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W; - defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD; - defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W; - } -} // isCodeGenOnly = 1 +let Predicates = [UseAVX] in { +defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG; +defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W; +defm VCVTSI2SD : 
sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG; +defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W; +} +let Constraints = "$src1 = $dst" in { + defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS; + defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W; + defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD; + defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W; +} + +def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">; +def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">; +def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">; +def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">; + +def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">; +def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">; + +def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}", + (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">; +def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}", + (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">; +def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}", + (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">; +def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}", + (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">; + +def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", + (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">; +def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", + (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">; /// SSE 1 Only // Aliases for intrinsics -let isCodeGenOnly = 1 in { let Predicates = [UseAVX] in { defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I>, XS, VEX; + WriteCvtSS2I>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I>, - XS, VEX, VEX_W; + XS, VEX, VEX_LIG, VEX_W; defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSS2I>, XD, VEX; + WriteCvtSS2I>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSS2I>, - XD, VEX, VEX_W; + XD, VEX, VEX_LIG, VEX_W; } defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", @@ -1112,7 +1061,40 @@ defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSD2I>, XD, REX_W; -} // isCodeGenOnly = 1 + +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : 
InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">; + +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">; let Predicates = [UseAVX] in { defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, @@ -1143,7 +1125,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, SSEPackedSingle, WriteCvtI2PS>, PS, Requires<[UseSSE2]>; -let Predicates = [UseAVX] in { +// AVX aliases def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", @@ -1160,8 +1142,8 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">; -} +// SSE aliases def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", @@ -1182,7 +1164,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", /// SSE 2 Only // Convert scalar double to scalar single -let hasSideEffects = 0, Predicates = [UseAVX] in { +let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -1200,6 +1182,7 @@ def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, Requires<[UseAVX]>; +let isCodeGenOnly = 1 in { def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (fpround FR64:$src))]>, @@ -1209,42 +1192,41 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>, XD, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtSD2SS.Folded]>; +} -let 
isCodeGenOnly = 1 in { def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>, - XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>, + (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, + XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS]>; def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsd2ss - VR128:$src1, sse_load_f64:$src2))]>, - XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>, + [(set VR128:$dst, + (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>, + XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>, + (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>; def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (int_x86_sse2_cvtsd2ss - VR128:$src1, sse_load_f64:$src2))]>, + [(set VR128:$dst, + (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } -} // isCodeGenOnly = 1 // Convert scalar single to scalar double // SSE2 instructions with XS prefix -let hasSideEffects = 0 in { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -1257,51 +1239,36 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), XS, VEX_4V, VEX_LIG, VEX_WIG, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, Requires<[UseAVX, OptForSize]>; -} +} // isCodeGenOnly = 1, hasSideEffects = 0 def : Pat<(f64 (fpextend FR32:$src)), (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[UseAVX, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[UseAVX, OptForSpeed]>; - +let isCodeGenOnly = 1 in { def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fpextend FR32:$src))]>, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (extloadf32 addr:$src))]>, + [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>, XS, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtSS2SD.Folded]>; +} // isCodeGenOnly = 1 -// extload f32 -> f64. This matches load+fpextend because we have a hack in -// the isel (PreprocessForFPConvert) that can introduce loads after dag -// combine. -// Since these loads aren't folded into the fpextend, we have to match it -// explicitly here. 
-def : Pat<(fpextend (loadf32 addr:$src)), - (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; - -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let hasSideEffects = 0 in { def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX_4V, VEX_WIG, + []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>; let mayLoad = 1 in def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>, + []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, @@ -1316,7 +1283,7 @@ def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, []>, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; } -} // isCodeGenOnly = 1 +} // hasSideEffects = 0 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary @@ -1476,15 +1443,11 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; // XMM only -def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; -def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; // YMM only def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), @@ -1497,12 +1460,13 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), [(set VR128:$dst, (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; -def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", - (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; -def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", - (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; } +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; + def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1540,17 +1504,6 @@ def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src) Sched<[WriteCvtPS2IYLd]>, VEX_WIG; } -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), - (VCVTTPS2DQrr VR128:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), - (VCVTTPS2DQrm addr:$src)>; - def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), - (VCVTTPS2DQYrr VR256:$src)>; - def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), - (VCVTTPS2DQYrm addr:$src)>; -} - def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1562,39 +1515,23 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (v4i32 
(X86cvttp2si (memopv4f32 addr:$src))))]>, Sched<[WriteCvtPS2ILd]>; -let Predicates = [UseSSE2] in { - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), - (CVTTPS2DQrr VR128:$src)>; - def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), - (CVTTPS2DQrm addr:$src)>; -} - -let Predicates = [HasAVX, NoVLX] in +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. +let Predicates = [HasAVX, NoVLX] in { +// XMM only def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; - -// The assembler can recognize rr 256-bit instructions by seeing a ymm -// register, but the same isn't true when using memory operands instead. -// Provide other assembly rr and rm forms to address this explicitly. - -// XMM only -def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; - -let Predicates = [HasAVX, NoVLX] in def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>, VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; -def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; // YMM only -let Predicates = [HasAVX, NoVLX] in { def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1605,11 +1542,12 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), [(set VR128:$dst, (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; -} -def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; +} // Predicates = [HasAVX, NoVLX] + +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), @@ -1618,21 +1556,6 @@ let Predicates = [HasAVX, NoVLX] in { (VCVTTPD2DQYrm addr:$src)>; } -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), - (VCVTPD2DQrr VR128:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), - (VCVTPD2DQrm addr:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), - (VCVTTPD2DQrr VR128:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), - (VCVTTPD2DQrm addr:$src)>; -} // Predicates = [HasAVX, NoVLX] - def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -1644,21 +1567,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>, Sched<[WriteCvtPD2ILd]>; -let Predicates = [UseSSE2] in { - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), - (CVTPD2DQrr VR128:$src)>; - def : Pat<(X86vzmovl (v2i64 
(bitconvert - (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))), - (CVTPD2DQrm addr:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), - (CVTTPD2DQrr VR128:$src)>; - def : Pat<(X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))), - (CVTTPD2DQrm addr:$src)>; -} // Predicates = [UseSSE2] - // Convert packed single to packed double let Predicates = [HasAVX, NoVLX] in { // SSE2 instructions without OpSize prefix @@ -1697,7 +1605,10 @@ let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>, + (v2f64 (X86VSintToFP + (bc_v4i32 + (v2i64 (scalar_to_vector + (loadi64 addr:$src)))))))]>, VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", @@ -1721,7 +1632,10 @@ let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>, + (v2f64 (X86VSintToFP + (bc_v4i32 + (v2i64 (scalar_to_vector + (loadi64 addr:$src)))))))]>, Sched<[WriteCvtI2PDLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", @@ -1731,17 +1645,13 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // AVX register conversion intrinsics let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), - (VCVTDQ2PDrm addr:$src)>; - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTDQ2PDrm addr:$src)>; } // Predicates = [HasAVX, NoVLX] // SSE2 register conversion intrinsics let Predicates = [UseSSE2] in { - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), - (CVTDQ2PDrm addr:$src)>; - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (CVTDQ2PDrm addr:$src)>; } // Predicates = [UseSSE2] @@ -1749,38 +1659,31 @@ let Predicates = [UseSSE2] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
-let Predicates = [HasAVX, NoVLX] in +let Predicates = [HasAVX, NoVLX] in { +// XMM only def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; - -// XMM only -def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", - (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; -let Predicates = [HasAVX, NoVLX] in def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>, VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; -def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", - (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">; -// YMM only -let Predicates = [HasAVX, NoVLX] in { def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (fpround VR256:$src))]>, + [(set VR128:$dst, (X86vfpround VR256:$src))]>, VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>, + [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>, VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; -} -def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", - (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; +} // Predicates = [HasAVX, NoVLX] + +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">; def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", - (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">; + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", @@ -1791,28 +1694,11 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>, Sched<[WriteCvtPD2PS.Folded]>; -// AVX 256-bit register conversion intrinsics -// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below -// whenever possible to avoid declaring two versions of each one. 
- let Predicates = [HasAVX, NoVLX] in { - // Match fpround and fpextend for 128/256-bit conversions - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))), - (VCVTPD2PSrr VR128:$src)>; - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), - (VCVTPD2PSrm addr:$src)>; -} - -let Predicates = [UseSSE2] in { - // Match fpround and fpextend for 128 conversions - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))), - (CVTPD2PSrr VR128:$src)>; - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (memopv2f64 addr:$src)))))), - (CVTPD2PSrm addr:$src)>; + def : Pat<(v4f32 (fpround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -1821,94 +1707,80 @@ let Predicates = [UseSSE2] in { // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - Operand CC, SDNode OpNode, ValueType VT, - PatFrag ld_frag, string asm, string asm_alt, + SDNode OpNode, ValueType VT, + PatFrag ld_frag, string asm, X86FoldableSchedWrite sched> { let isCommutable = 1 in def rr : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>, Sched<[sched]>; def rm : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), (ld_frag addr:$src2), imm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; - - // Accept explicit immediate argument form instead of comparison code. 
- let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>, - Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>, - Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable; - } } -let ExeDomain = SSEPackedSingle in -defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, - "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; -let ExeDomain = SSEPackedDouble in -defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, - "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PD.Scl>, - XD, VEX_4V, VEX_LIG, VEX_WIG; - -let Constraints = "$src1 = $dst" in { +let isCodeGenOnly = 1 in { let ExeDomain = SSEPackedSingle in - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, - "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", - "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PS.Scl>, XS; + defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; let ExeDomain = SSEPackedDouble in - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, - "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", - "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PD.Scl>, XD; + defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PD.Scl>, + XD, VEX_4V, VEX_LIG, VEX_WIG; + + let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PS.Scl>, XS; + let ExeDomain = SSEPackedDouble in + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PD.Scl>, XD; + } } -multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, +multiclass sse12_cmp_scalar_int<Operand memop, Intrinsic Int, string asm, X86FoldableSchedWrite sched, ComplexPattern mem_cpat> { def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src, CC:$cc), asm, + (ins VR128:$src1, VR128:$src, u8imm:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, VR128:$src, imm:$cc))]>, Sched<[sched]>; let mayLoad = 1 in def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, memop:$src, CC:$cc), asm, + (ins VR128:$src1, memop:$src, u8imm:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, mem_cpat:$src, imm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } -let isCodeGenOnly = 1 in { - // Aliases to match intrinsics which expect XMM operand(s). +// Aliases to match intrinsics which expect XMM operand(s). 
+let ExeDomain = SSEPackedSingle in +defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, + "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, + XS, VEX_4V, VEX_LIG, VEX_WIG; +let ExeDomain = SSEPackedDouble in +defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, + "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, + XD, VEX_4V, VEX_LIG, VEX_WIG; +let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in - defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", - SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V; + defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, + "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}", + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; let ExeDomain = SSEPackedDouble in - defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", - SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, - XD, VEX_4V; - let Constraints = "$src1 = $dst" in { - let ExeDomain = SSEPackedSingle in - defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; - let ExeDomain = SSEPackedDouble in - defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; -} + defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, + "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}", + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; } @@ -1962,14 +1834,14 @@ let Defs = [EFLAGS] in { let isCodeGenOnly = 1 in { defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG; + sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG; + sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG; + sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG; + sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss", WriteFCom>, PS; @@ -1998,56 +1870,38 @@ let Defs = [EFLAGS] in { // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, - Operand CC, ValueType VT, string asm, - string asm_alt, X86FoldableSchedWrite sched, + ValueType VT, string asm, + X86FoldableSchedWrite sched, Domain d, PatFrag ld_frag> { let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>, Sched<[sched]>; def rmi : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; - - // 
Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), - asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def rmi_alt : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), - asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } } -defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32, - "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; -defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64, - "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; -defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32, - "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG; -defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64, - "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in { - defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32, - "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", + defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; - defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64, - "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", + defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; } @@ -2111,12 +1965,14 @@ let Predicates = [UseSSE1] in { /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, - X86FoldableSchedWrite sched, Domain d> { + X86FoldableSchedWrite sched, Domain d, + bit IsCommutable = 0> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; + let isCommutable = IsCommutable in def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, @@ -2148,7 +2004,7 @@ let Constraints = "$src1 = $dst" in { memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; + memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; } //===----------------------------------------------------------------------===// @@ -2238,6 +2094,13 @@ let Predicates = [HasAVX1Only] in { (VUNPCKHPDYrr 
VR256:$src1, VR256:$src2)>; } +let Predicates = [UseSSE2] in { + // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (v2f64 (nonvolatile_load addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Extract Floating-Point Sign mask //===----------------------------------------------------------------------===// @@ -2523,99 +2386,6 @@ let Predicates = [HasAVX1Only] in { (VANDNPSYrm VR256:$src1, addr:$src2)>; } -let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { - // Use packed logical operations for scalar ops. - def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - - def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; - def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; - def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; - def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; -} - -let Predicates = [UseSSE1] in { - // Use packed logical operations for scalar ops. - def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; - def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; - def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; - def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), - (COPY_TO_REGCLASS - (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), - (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), - FR32)>; -} - -let Predicates = [UseSSE2] in { - // Use packed logical operations for scalar ops. 
- def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; - def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), - (COPY_TO_REGCLASS - (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), - (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), - FR64)>; -} - let Predicates = [HasAVX, NoVLX] in { def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), (VPANDrr VR128:$src1, VR128:$src2)>; @@ -2908,7 +2678,8 @@ let isCodeGenOnly = 1 in { // patterns we have to try to match. multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, ValueType VT, ValueType EltTy, - RegisterClass RC, Predicate BasePredicate> { + RegisterClass RC, PatFrag ld_frag, + Predicate BasePredicate> { let Predicates = [BasePredicate] in { // extracted scalar math op with insert via movss/movsd def : Pat<(VT (Move (VT VR128:$dst), @@ -2917,6 +2688,11 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, RC:$src))))), (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; + def : Pat<(VT (Move (VT VR128:$dst), + (VT (scalar_to_vector + (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), + (ld_frag addr:$src)))))), + (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>; } // Repeat for AVX versions of the instructions. 
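(Illustration only, not part of the patch: a minimal intrinsic-level sketch of the shape the newly added rm_Int load patterns fold; the helper name is hypothetical, and whether the load actually folds is still subject to the usual legality checks.)

#include <xmmintrin.h>

// Lane-0 add whose right-hand operand comes straight from memory; with the
// added pattern this can select ADDSSrm_Int ("addss (%mem), %xmm0") instead of
// a separate scalar load followed by ADDSSrr_Int.
__m128 add_low_from_mem(__m128 v, const float *p) {  // hypothetical helper
  return _mm_add_ss(v, _mm_load_ss(p));
}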
@@ -2928,18 +2704,23 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, RC:$src))))), (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; + def : Pat<(VT (Move (VT VR128:$dst), + (VT (scalar_to_vector + (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), + (ld_frag addr:$src)))))), + (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; } } -defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>; -defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>; -defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>; -defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>; +defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; -defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; -defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; -defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; -defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>; +defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to @@ -2956,7 +2737,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType ScalarVT, X86MemOperand x86memop, Operand intmemop, SDNode OpNode, Domain d, X86FoldableSchedWrite sched, Predicate target> { - let hasSideEffects = 0 in { + let isCodeGenOnly = 1, hasSideEffects = 0 in { def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, @@ -2967,8 +2748,9 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (OpNode (load addr:$src1)))], d>, Sched<[sched.Folded]>, Requires<[target, OptForSize]>; + } - let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in { + let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, Sched<[sched]>; @@ -2977,7 +2759,6 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } - } } @@ -3022,7 +2803,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType ScalarVT, X86MemOperand x86memop, Operand intmemop, SDNode OpNode, Domain d, X86FoldableSchedWrite sched, Predicate target> { - let hasSideEffects = 0 in { + let isCodeGenOnly = 1, hasSideEffects = 0 in { def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [], d>, 
Sched<[sched]>; @@ -3030,7 +2811,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isCodeGenOnly = 1, ExeDomain = d in { + } + let hasSideEffects = 0, ExeDomain = d in { def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -3041,7 +2823,6 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } - } // We don't want to fold scalar loads into these instructions unless // optimizing for size. This is because the folded instruction will have a @@ -3197,23 +2978,6 @@ multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Mo } } -multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, - ValueType VT, bits<8> ImmV, - Predicate BasePredicate> { - let Predicates = [BasePredicate] in { - def : Pat<(VT (Move VT:$dst, (scalar_to_vector - (OpNode (extractelt VT:$src, 0))))), - (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; - } - - // Repeat for AVX versions of the instructions. - let Predicates = [UseAVX] in { - def : Pat<(VT (Move VT:$dst, (scalar_to_vector - (OpNode (extractelt VT:$src, 0))))), - (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; - } -} - defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; @@ -3388,16 +3152,20 @@ def : Pat<(X86MFence), (MFENCE)>; // SSE 1 & 2 - Load/Store XCSR register //===----------------------------------------------------------------------===// +let mayLoad=1, hasSideEffects=1 in def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX, Sched<[WriteLDMXCSR]>, VEX_WIG; +let mayStore=1, hasSideEffects=1 in def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX, Sched<[WriteSTMXCSR]>, VEX_WIG; +let mayLoad=1, hasSideEffects=1 in def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, TB, Sched<[WriteLDMXCSR]>; +let mayStore=1, hasSideEffects=1 in def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, TB, Sched<[WriteSTMXCSR]>; @@ -3529,17 +3297,6 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), } // ExeDomain = SSEPackedInt -// Aliases to help the assembler pick two byte VEX encodings by swapping the -// operands relative to the normal instructions to use VEX.R instead of VEX.B. -def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", - (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", - (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>; -def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", - (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>; -def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", - (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>; - // Reversed version with ".s" suffix for GAS compatibility. 
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; @@ -4118,7 +3875,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3), @@ -4138,7 +3895,7 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>, - PD, VEX, Sched<[WriteVecExtract]>; + PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4148,7 +3905,7 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, // Insert let Predicates = [HasAVX, NoBWI] in -defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; +defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in defm PINSRW : sse2_pinsrw, PD; @@ -4279,19 +4036,11 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX, Sched<[WriteVecMoveFromGpr]>; - def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, - VEX, Sched<[WriteVecLoad]>; def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>, Sched<[WriteVecMoveFromGpr]>; - def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, - Sched<[WriteVecLoad]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 //===---------------------------------------------------------------------===// @@ -4353,32 +4102,15 @@ def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), // Bitcast FR64 <-> GR64 // let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { - let Predicates = [UseAVX] in - def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, - VEX, Sched<[WriteVecLoad]>; def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))]>, VEX, Sched<[WriteVecMoveToGpr]>; - def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), - "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, - VEX, Sched<[WriteVecStore]>; - def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, - Sched<[WriteVecLoad]>; def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))]>, Sched<[WriteVecMoveToGpr]>; - def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), - "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, - Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 //===---------------------------------------------------------------------===// @@ 
-4389,18 +4121,10 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX, Sched<[WriteVecMoveToGpr]>; - def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, - VEX, Sched<[WriteVecStore]>; def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>, Sched<[WriteVecMoveToGpr]>; - def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, - Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 let Predicates = [UseAVX] in { @@ -4410,28 +4134,14 @@ let Predicates = [UseAVX] in { def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), (VMOV64toPQIrr GR64:$src)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>; // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), (VMOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), - (VMOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), + def : Pat<(v4i32 (X86vzload32 addr:$src)), (VMOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzload addr:$src)), - (VMOVDI2PDIrm addr:$src)>; - def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; - def : Pat<(v8i32 (X86vzload addr:$src)), + def : Pat<(v8i32 (X86vzload32 addr:$src)), (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; - // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. - def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>; } let Predicates = [UseSSE2] in { @@ -4442,11 +4152,7 @@ let Predicates = [UseSSE2] in { (MOV64toPQIrr GR64:$src)>; def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), - (MOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), - (MOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzload addr:$src)), + def : Pat<(v4i32 (X86vzload32 addr:$src)), (MOVDI2PDIrm addr:$src)>; } @@ -4508,32 +4214,26 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", []>; } -// Aliases to help the assembler pick two byte VEX encodings by swapping the -// operands relative to the normal instructions to use VEX.R instead of VEX.B. 
-def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}", - (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>; - def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; let Predicates = [UseAVX] in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (VMOVQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), + def : Pat<(v2i64 (X86vzload64 addr:$src)), (VMOVQI2PQIrm addr:$src)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; - def : Pat<(v4i64 (X86vzload addr:$src)), + def : Pat<(v4i64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; + + def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), + (VMOVPQI2QImr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE2] in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (MOVQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>; + + def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), + (MOVPQI2QImr addr:$dst, VR128:$src)>; } //===---------------------------------------------------------------------===// @@ -4560,6 +4260,19 @@ let Predicates = [UseSSE2] in { (MOVZPQILo2PQIrr VR128:$src)>; } +let Predicates = [UseAVX] in { + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVZPQILo2PQIrr + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), + sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVZPQILo2PQIrr + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), + sub_xmm)>; +} + //===---------------------------------------------------------------------===// // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// @@ -4667,17 +4380,17 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(X86Movddup (loadv2f64 addr:$src)), + def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))), + def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; } let Predicates = [UseSSE3] in { // No need for aligned memory as this only loads 64-bits. 
- def : Pat<(X86Movddup (loadv2f64 addr:$src)), + def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))), + def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (MOVDDUPrm addr:$src)>; } @@ -5130,15 +4843,12 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in //===---------------------------------------------------------------------===// let SchedRW = [WriteSystem] in { -let usesCustomInserter = 1 in { -def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), - [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, - Requires<[HasSSE3]>; -} - let Uses = [EAX, ECX, EDX] in -def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, - TB, Requires<[HasSSE3]>; +def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, + TB, Requires<[HasSSE3, Not64BitMode]>; +let Uses = [RAX, ECX, EDX] in +def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, + TB, Requires<[HasSSE3, In64BitMode]>; let Uses = [ECX, EAX] in def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", @@ -5148,13 +4858,14 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; -def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>, Requires<[Not64BitMode]>; -def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // SSE4.1 - Packed Move with Sign/Zero Extend +// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp //===----------------------------------------------------------------------===// multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, @@ -5202,71 +4913,38 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; -// Patterns that we also need for any_extend. -// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg. 
-multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> { - // Register-Register patterns - let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), - (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; - } - - let Predicates = [HasAVX2, NoVLX] in { - def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), - (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), - (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; - } - - // AVX2 Register-Memory patterns - let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - } - - let Predicates = [HasAVX2, NoVLX] in { - def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; - } -} - // AVX2 Patterns multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, - SDNode ExtOp, SDNode InVecOp> : - SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> { - + SDNode ExtOp, SDNode InVecOp> { // Register-Register patterns + let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; } // Simple Register-Memory patterns let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; } + let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; @@ -5284,38 +4962,31 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, // AVX2 Register-Memory patterns let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 
addr:$src)))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; } } defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; -defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>; // SSE4.1/AVX patterns. multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, @@ -5361,9 +5032,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#BWrm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWrm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BWrm) addr:$src)>; @@ -5371,19 +5040,13 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#BDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#BDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), (!cast<I>(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), (!cast<I>(OpcPrefix#BQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#BQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), (!cast<I>(OpcPrefix#BQrm) addr:$src)>; @@ -5391,18 +5054,14 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#WDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 
addr:$src))))), (!cast<I>(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), (!cast<I>(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#WQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), - (!cast<I>(OpcPrefix#WQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), (!cast<I>(OpcPrefix#WQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))), (!cast<I>(OpcPrefix#WQrm) addr:$src)>; @@ -5411,9 +5070,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, (!cast<I>(OpcPrefix#DQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#DQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#DQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), (!cast<I>(OpcPrefix#DQrm) addr:$src)>; @@ -5451,7 +5108,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { } let Predicates = [HasAVX, NoBWI] in - defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -5475,7 +5132,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { } let Predicates = [HasAVX, NoBWI] in - defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; + defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -5548,18 +5205,6 @@ let ExeDomain = SSEPackedSingle in { defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; } -// Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
-def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), - imm:$src2))), - addr:$dst), - (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[HasAVX]>; -def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), - imm:$src2))), - addr:$dst), - (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[UseSSE41]>; - //===----------------------------------------------------------------------===// // SSE4.1 - Insert Instructions //===----------------------------------------------------------------------===// @@ -5573,7 +5218,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i8mem:$src2, u8imm:$src3), !if(Is2Addr, @@ -5586,7 +5231,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX, NoBWI] in - defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -5599,7 +5244,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2, u8imm:$src3), !if(Is2Addr, @@ -5625,7 +5270,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2, u8imm:$src3), !if(Is2Addr, @@ -5647,6 +5292,7 @@ let Constraints = "$src1 = $dst" in // vector. The next one matches the intrinsic and could zero arbitrary elements // in the target vector. 
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { + let isCommutable = 1 in def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, u8imm:$src3), !if(Is2Addr, @@ -5853,7 +5499,7 @@ let Predicates = [HasAVX, NoVLX] in { VEX, VEX_L, VEX_WIG; } } -let Predicates = [HasAVX, NoAVX512] in { +let Predicates = [UseAVX] in { defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales, 0>, VEX_4V, VEX_LIG, VEX_WIG; @@ -5862,141 +5508,17 @@ let Predicates = [HasAVX, NoAVX512] in { } let Predicates = [UseAVX] in { - def : Pat<(ffloor FR32:$src), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; - def : Pat<(f32 (fceil FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; - def : Pat<(f32 (frint FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; - def : Pat<(f64 (fceil FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; - def : Pat<(f64 (frint FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + def : Pat<(X86VRndScale FR32:$src1, imm:$src2), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, imm:$src2), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { - def : Pat<(ffloor (loadf32 addr:$src)), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; - def : Pat<(f32 (fceil (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; - def : Pat<(f32 (frint (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc (loadf32 addr:$src))), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; - def : Pat<(f64 (fceil (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; - def : Pat<(f64 (frint (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc (loadf64 addr:$src))), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; -} - -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4f32 (ffloor VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x9))>; - def : Pat<(v4f32 (fnearbyint VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0xC))>; - def : Pat<(v4f32 (fceil VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0xA))>; - def : Pat<(v4f32 (frint VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x4))>; - def : Pat<(v4f32 (ftrunc VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0xB))>; - - def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0x9))>; - def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0xC))>; - def : Pat<(v4f32 (fceil 
(loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0xA))>; - def : Pat<(v4f32 (frint (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0x4))>; - def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))), - (VROUNDPSm addr:$src, (i32 0xB))>; - - def : Pat<(v2f64 (ffloor VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x9))>; - def : Pat<(v2f64 (fnearbyint VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0xC))>; - def : Pat<(v2f64 (fceil VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0xA))>; - def : Pat<(v2f64 (frint VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x4))>; - def : Pat<(v2f64 (ftrunc VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0xB))>; - - def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0x9))>; - def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0xC))>; - def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0xA))>; - def : Pat<(v2f64 (frint (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0x4))>; - def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))), - (VROUNDPDm addr:$src, (i32 0xB))>; - - def : Pat<(v8f32 (ffloor VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0x9))>; - def : Pat<(v8f32 (fnearbyint VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0xC))>; - def : Pat<(v8f32 (fceil VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0xA))>; - def : Pat<(v8f32 (frint VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0x4))>; - def : Pat<(v8f32 (ftrunc VR256:$src)), - (VROUNDPSYr VR256:$src, (i32 0xB))>; - - def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0x9))>; - def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0xC))>; - def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0xA))>; - def : Pat<(v8f32 (frint (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0x4))>; - def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))), - (VROUNDPSYm addr:$src, (i32 0xB))>; - - def : Pat<(v4f64 (ffloor VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0x9))>; - def : Pat<(v4f64 (fnearbyint VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0xC))>; - def : Pat<(v4f64 (fceil VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0xA))>; - def : Pat<(v4f64 (frint VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0x4))>; - def : Pat<(v4f64 (ftrunc VR256:$src)), - (VROUNDPDYr VR256:$src, (i32 0xB))>; - - def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0x9))>; - def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0xC))>; - def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0xA))>; - def : Pat<(v4f64 (frint (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0x4))>; - def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))), - (VROUNDPDYm addr:$src, (i32 0xB))>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; } let ExeDomain = SSEPackedSingle in @@ -6013,108 +5535,19 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { - def : Pat<(ffloor FR32:$src), - (ROUNDSSr FR32:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0xC))>; - def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0xA))>; - def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc 
FR32:$src)), - (ROUNDSSr FR32:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0xC))>; - def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0xA))>; - def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr FR64:$src, (i32 0xB))>; + def : Pat<(X86VRndScale FR32:$src1, imm:$src2), + (ROUNDSSr FR32:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, imm:$src2), + (ROUNDSDr FR64:$src1, imm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { - def : Pat<(ffloor (loadf32 addr:$src)), - (ROUNDSSm addr:$src, (i32 0x9))>; - def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0xC))>; - def : Pat<(f32 (fceil (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0xA))>; - def : Pat<(f32 (frint (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc (loadf32 addr:$src))), - (ROUNDSSm addr:$src, (i32 0xB))>; - - def : Pat<(f64 (ffloor (loadf64 addr:$src))), - (ROUNDSDm addr:$src, (i32 0x9))>; - def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), - (ROUNDSDm addr:$src, (i32 0xC))>; - def : Pat<(f64 (fceil (loadf64 addr:$src))), - (ROUNDSDm addr:$src, (i32 0xA))>; - def : Pat<(f64 (frint (loadf64 addr:$src))), - (ROUNDSDm addr:$src, (i32 0x4))>; - def : Pat<(f64 (ftrunc (loadf64 addr:$src))), - (ROUNDSDm addr:$src, (i32 0xB))>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), + (ROUNDSSm addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), + (ROUNDSDm addr:$src1, imm:$src2)>; } -let Predicates = [UseSSE41] in { - def : Pat<(v4f32 (ffloor VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x9))>; - def : Pat<(v4f32 (fnearbyint VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0xC))>; - def : Pat<(v4f32 (fceil VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0xA))>; - def : Pat<(v4f32 (frint VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x4))>; - def : Pat<(v4f32 (ftrunc VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0xB))>; - - def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))), - (ROUNDPSm addr:$src, (i32 0x9))>; - def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))), - (ROUNDPSm addr:$src, (i32 0xC))>; - def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))), - (ROUNDPSm addr:$src, (i32 0xA))>; - def : Pat<(v4f32 (frint (memopv4f32 addr:$src))), - (ROUNDPSm addr:$src, (i32 0x4))>; - def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))), - (ROUNDPSm addr:$src, (i32 0xB))>; - - def : Pat<(v2f64 (ffloor VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x9))>; - def : Pat<(v2f64 (fnearbyint VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0xC))>; - def : Pat<(v2f64 (fceil VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0xA))>; - def : Pat<(v2f64 (frint VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x4))>; - def : Pat<(v2f64 (ftrunc VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0xB))>; - - def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))), - (ROUNDPDm addr:$src, (i32 0x9))>; - def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))), - (ROUNDPDm addr:$src, (i32 0xC))>; - def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))), - (ROUNDPDm addr:$src, (i32 0xA))>; - def : Pat<(v2f64 (frint (memopv2f64 addr:$src))), - (ROUNDPDm addr:$src, (i32 0x4))>; - def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))), - (ROUNDPDm addr:$src, (i32 0xB))>; -} - -defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss, - v4f32, 0x01, UseSSE41>; -defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", 
X86Movss, - v4f32, 0x02, UseSSE41>; -defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd, - v2f64, 0x01, UseSSE41>; -defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd, - v2f64, 0x02, UseSSE41>; - //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// @@ -6449,6 +5882,72 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{ return getI8Imm(Imm ^ 0xff, SDLoc(N)); }]>; +// Turn a 4-bit blendi immediate to 8-bit for use with pblendw. +def BlendScaleImm4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 4; ++i) { + if (Imm & (1 << i)) + NewImm |= 0x3 << (i * 2); + } + return getI8Imm(NewImm, SDLoc(N)); +}]>; + +// Turn a 2-bit blendi immediate to 8-bit for use with pblendw. +def BlendScaleImm2 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 2; ++i) { + if (Imm & (1 << i)) + NewImm |= 0xf << (i * 4); + } + return getI8Imm(NewImm, SDLoc(N)); +}]>; + +// Turn a 2-bit blendi immediate to 4-bit for use with pblendd. +def BlendScaleImm2to4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 2; ++i) { + if (Imm & (1 << i)) + NewImm |= 0x3 << (i * 2); + } + return getI8Imm(NewImm, SDLoc(N)); +}]>; + +// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. +def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 4; ++i) { + if (Imm & (1 << i)) + NewImm |= 0x3 << (i * 2); + } + return getI8Imm(NewImm ^ 0xff, SDLoc(N)); +}]>; + +// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it. +def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 2; ++i) { + if (Imm & (1 << i)) + NewImm |= 0xf << (i * 4); + } + return getI8Imm(NewImm ^ 0xff, SDLoc(N)); +}]>; + +// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. +def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 2; ++i) { + if (Imm & (1 << i)) + NewImm |= 0x3 << (i * 2); + } + return getI8Imm(NewImm ^ 0xf, SDLoc(N)); +}]>; + let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, @@ -6559,6 +6058,42 @@ let Predicates = [HasAVX2] in { VEX_4V, VEX_L, VEX_WIG; } +// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. +// ExecutionDomainFixPass will cleanup domains later on. +let Predicates = [HasAVX1Only] in { +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), + (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), + (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), + (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>; + +// Use pblendw for 128-bit integer to keep it in the integer domain and prevent +// it from becoming movsd via commuting under optsize. 
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; + +def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3), + (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>; +def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3), + (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3), + (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>; + +// Use pblendw for 128-bit integer to keep it in the integer domain and prevent +// it from becoming movss via commuting under optsize. +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +} + defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, VR128, memop, f128mem, 1, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>; @@ -6569,6 +6104,24 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, VR128, memop, i128mem, 1, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>; +let Predicates = [UseSSE41] in { +// Use pblendw for 128-bit integer to keep it in the integer domain and prevent +// it from becoming movss via commuting under optsize. +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; +def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; + +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +} + // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. 
let Predicates = [HasAVX] in { @@ -6580,18 +6133,25 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; + +def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)), + (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xc)>; +def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; } -/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, Intrinsic IntId, - X86FoldableSchedWrite sched> { +/// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators +multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag mem_frag, SDNode OpNode, + X86FoldableSchedWrite sched> { def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], + [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))], SSEPackedInt>, TAPD, VEX_4V, Sched<[sched]>; @@ -6600,8 +6160,8 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (IntId RC:$src1, (mem_frag addr:$src2), - RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, + (OpNode RC:$src3, (mem_frag addr:$src2), + RC:$src1))], SSEPackedInt>, TAPD, VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -6612,68 +6172,47 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { -defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, - load, int_x86_sse41_blendvpd, - SchedWriteFVarBlend.XMM>; -defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, - loadv4f64, int_x86_avx_blendv_pd_256, - SchedWriteFVarBlend.YMM>, VEX_L; +defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem, + v2f64, loadv2f64, X86Blendv, + SchedWriteFVarBlend.XMM>; +defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem, + v4f64, loadv4f64, X86Blendv, + SchedWriteFVarBlend.YMM>, VEX_L; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, - load, int_x86_sse41_blendvps, - SchedWriteFVarBlend.XMM>; -defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, - loadv8f32, int_x86_avx_blendv_ps_256, - SchedWriteFVarBlend.YMM>, VEX_L; +defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem, + v4f32, loadv4f32, X86Blendv, + SchedWriteFVarBlend.XMM>; +defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem, + v8f32, loadv8f32, X86Blendv, + SchedWriteFVarBlend.YMM>, VEX_L; } // ExeDomain = SSEPackedSingle -defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - load, int_x86_sse41_pblendvb, - SchedWriteVarBlend.XMM>; +defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem, + v16i8, loadv16i8, X86Blendv, + 
SchedWriteVarBlend.XMM>; } let Predicates = [HasAVX2] in { -defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - load, int_x86_avx2_pblendvb, - SchedWriteVarBlend.YMM>, VEX_L; +defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem, + v32i8, loadv32i8, X86Blendv, + SchedWriteVarBlend.YMM>, VEX_L; } let Predicates = [HasAVX] in { - def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), - (v4i32 VR128:$src2))), - (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), - (v4f32 VR128:$src2))), + def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), - (v2i64 VR128:$src2))), + def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), - (v2f64 VR128:$src2))), - (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; - def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), - (v8i32 VR256:$src2))), - (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), - (v8f32 VR256:$src2))), + def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), - (v4i64 VR256:$src2))), - (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), - (v4f64 VR256:$src2))), + def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; } -let Predicates = [HasAVX2] in { - def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; -} - // Prefer a movss or movsd over a blendps when optimizing for size. these were // changed to use blends because blends have better throughput on sandybridge // and haswell, but movs[s/d] are 1-2 byte shorter instructions. @@ -6708,17 +6247,6 @@ let Predicates = [HasAVX, OptForSpeed] in { (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), (i8 3))), sub_xmm)>; - - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), - (i8 1))), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), - (i8 0xf))), sub_xmm)>; } // Prefer a movss or movsd over a blendps when optimizing for size. 
these were @@ -6747,16 +6275,17 @@ let Predicates = [UseSSE41, OptForSpeed] in { } -/// SS41I_ternary_int - SSE 4.1 ternary operator +/// SS41I_ternary - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { - multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - X86MemOperand x86memop, Intrinsic IntId, - X86FoldableSchedWrite sched> { + multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT, + PatFrag mem_frag, X86MemOperand x86memop, + SDNode OpNode, X86FoldableSchedWrite sched> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, + [(set VR128:$dst, + (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>, Sched<[sched]>; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), @@ -6764,20 +6293,19 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), [(set VR128:$dst, - (IntId VR128:$src1, - (mem_frag addr:$src2), XMM0))]>, + (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem, - int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; +defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem, + X86Blendv, SchedWriteFVarBlend.XMM>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem, - int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem, - int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; +defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem, + X86Blendv, SchedWriteFVarBlend.XMM>; +defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem, + X86Blendv, SchedWriteVarBlend.XMM>; // Aliases with the implicit xmm0 argument def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", @@ -6794,20 +6322,11 @@ def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; let Predicates = [UseSSE41] in { - def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), - (v4i32 VR128:$src2))), - (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), - (v4f32 VR128:$src2))), + def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), - (v2i64 VR128:$src2))), - (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), - (v2f64 VR128:$src2))), + def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; } @@ -7451,17 +6970,6 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; -let Predicates = [HasAVX2, NoVLX] in { -def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTI128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), - (VBROADCASTI128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 
addr:$src))), - (VBROADCASTI128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), - (VBROADCASTI128 addr:$src)>; -} - let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), (VBROADCASTF128 addr:$src)>; @@ -7469,7 +6977,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), (VBROADCASTF128 addr:$src)>; } -let Predicates = [HasAVX1Only] in { +// NOTE: We're using FP instructions here, but execution domain fixing can +// convert to integer when profitable. +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTF128 addr:$src)>; def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), @@ -7765,12 +7275,10 @@ let Predicates = [HasF16C, NoVLX] in { WriteCvtPS2PHYSt>, VEX_L; // Pattern match vcvtph2ps of a scalar i64 load. - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), - (VCVTPH2PSrm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert - (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 + (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (VCVTPH2PSrm addr:$src)>; def : Pat<(store (f64 (extractelt @@ -7835,6 +7343,7 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, (commuteXForm imm:$src3))>; } +let Predicates = [HasAVX2] in { defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, SchedWriteBlend.XMM, VR128, i128mem, BlendCommuteImm4>; @@ -7842,28 +7351,26 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, SchedWriteBlend.YMM, VR256, i256mem, BlendCommuteImm8>, VEX_L; -// For insertion into the zero index (low half) of a 256-bit vector, it is -// more efficient to generate a blend with immediate instead of an insert*128. 
-let Predicates = [HasAVX2] in { -def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), - (VPBLENDDYrri VR256:$src1, - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - VR128:$src2, sub_xmm), 0xf)>; -def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), - (VPBLENDDYrri VR256:$src1, - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - VR128:$src2, sub_xmm), 0xf)>; -def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), - (VPBLENDDYrri VR256:$src1, - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - VR128:$src2, sub_xmm), 0xf)>; -def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), - (VPBLENDDYrri VR256:$src1, - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), + (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; + +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), + (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>; } -let Predicates = [HasAVX1Only] in { +// For insertion into the zero index (low half) of a 256-bit vector, it is +// more efficient to generate a blend with immediate instead of an insert*128. +// NOTE: We're using FP instructions here, but exeuction domain fixing should +// take care of using integer instructions when profitable. +let Predicates = [HasAVX] in { def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), @@ -7880,6 +7387,19 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; + +def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; } //===----------------------------------------------------------------------===// @@ -7930,9 +7450,9 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, let Predicates = [HasAVX2, NoVLX] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. 
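The comment just above is the reason the patterns that follow match the broadcast source as X86vzload64 rather than a plain i64 load: a 32-bit target cannot load an i64 into a general-purpose register in one instruction, but a vector movq (a zero-extending 64-bit load) works fine. A rough intrinsics-level equivalent, assuming an AVX2-enabled build (e.g. -mavx2); the actual instruction selection naturally depends on target and flags:

#include <immintrin.h>

// Broadcast a 64-bit value from memory to every qword lane of a ymm register.
// The load is expressed as a vector movq (zero-extending 64-bit load), which
// is the shape the X86vzload64 broadcast patterns capture.
__m256i broadcast_qword(const long long *p) {
  __m128i lo = _mm_loadl_epi64((const __m128i *)p); // movq
  return _mm256_broadcastq_epi64(lo);               // vpbroadcastq
}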
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQYrm addr:$src)>; def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), @@ -7952,9 +7472,15 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWrm addr:$src)>; def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (extloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; } @@ -8038,7 +7564,7 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDDUPrr VR128:$src)>; def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), (VMOVDDUPrm addr:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))), + def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPrm addr:$src)>; } @@ -8236,19 +7762,14 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, ValueType MaskVT, string BlendStr, ValueType ZeroVT> { // masked store - def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), + def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; // masked load - def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)), + def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)), (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; - def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), - (VT (bitconvert (ZeroVT immAllZerosV))))), + def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), + (VT immAllZerosV))), (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; - def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))), - (!cast<Instruction>(BlendStr#"rr") - RC:$src0, - (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)), - RC:$mask)>; } let Predicates = [HasAVX] in { defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>; @@ -8275,21 +7796,6 @@ let Predicates = [HasAVX2] in { // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
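Returning to the maskmov_lowering multiclass above: with this change the AVX vmaskmov instructions are selected directly from the generic masked_store and masked_load nodes, and a masked load whose pass-through is the all-zeros vector needs no extra blend because vmaskmov already zero-fills the inactive lanes. A user-level illustration with the corresponding AVX intrinsics (assumes an AVX-enabled build, e.g. -mavx):

#include <immintrin.h>

// Load and store only the lanes whose mask element has its sign bit set;
// inactive lanes of the load come back as zero. These are the masked_load /
// masked_store shapes the patterns above lower to vmaskmovps directly.
void masked_copy(float *dst, const float *src, __m256i mask) {
  __m256 v = _mm256_maskload_ps(src, mask);
  _mm256_maskstore_ps(dst, mask, v);
}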
-let Predicates = [HasAVX2, NoVLX] in { -def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v2i64 VR128:$src), 1)>; -def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v4i32 VR128:$src), 1)>; -def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v8i16 VR128:$src), 1)>; -def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), - (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), - (v16i8 VR128:$src), 1)>; -} - let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), @@ -8299,7 +7805,9 @@ def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), (v4f32 VR128:$src), 1)>; } -let Predicates = [HasAVX1Only] in { +// NOTE: We're using FP instructions here, but execution domain fixing can +// convert to integer when profitable. +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), (v2i64 VR128:$src), 1)>; @@ -8318,7 +7826,7 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), // Variable Bit Shifts // multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode IntrinNode, ValueType vt128, ValueType vt256> { + ValueType vt128, ValueType vt256> { def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -8347,23 +7855,14 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, (vt256 (load addr:$src2)))))]>, VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, SchedWriteVarVecShift.YMM.ReadAfterFold]>; - - def : Pat<(vt128 (IntrinNode VR128:$src1, VR128:$src2)), - (!cast<Instruction>(NAME#"rr") VR128:$src1, VR128:$src2)>; - def : Pat<(vt128 (IntrinNode VR128:$src1, (load addr:$src2))), - (!cast<Instruction>(NAME#"rm") VR128:$src1, addr:$src2)>; - def : Pat<(vt256 (IntrinNode VR256:$src1, VR256:$src2)), - (!cast<Instruction>(NAME#"Yrr") VR256:$src1, VR256:$src2)>; - def : Pat<(vt256 (IntrinNode VR256:$src1, (load addr:$src2))), - (!cast<Instruction>(NAME#"Yrm") VR256:$src1, addr:$src2)>; } let Predicates = [HasAVX2, NoVLX] in { - defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, X86vshlv, v4i32, v8i32>; - defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, X86vshlv, v2i64, v4i64>, VEX_W; - defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, X86vsrlv, v4i32, v8i32>; - defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, X86vsrlv, v2i64, v4i64>, VEX_W; - defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, X86vsrav, v4i32, v8i32>; + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>; } //===----------------------------------------------------------------------===// @@ -8393,7 +7892,7 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx, VEX, VEX_L, Sched<[WriteLoad]>; } -let Predicates = [UseAVX2] in { +let 
Predicates = [HasAVX2] in { let mayLoad = 1, hasSideEffects = 0, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td index 2dc6e8b43667..82c8e74156b2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSVM.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td @@ -1,9 +1,8 @@ //===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index 7cd63a6dd820..9d974b716dda 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -1,9 +1,8 @@ //===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -31,11 +30,11 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (shl GR64:$src1, CL))]>; } // Uses = [CL], SchedRW +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>; -let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, @@ -473,17 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>; + [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>; def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16; + [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>, + OpSize16; def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32; + [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>, + OpSize32; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>; + [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>; // Rotate by 1 def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), @@ -586,16 +587,16 @@ def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), // Rotate by 1 def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "ror{b}\t$dst", - [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>; + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>; def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "ror{w}\t$dst", - [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16; + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize16; def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "ror{l}\t$dst", - [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32; + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>, OpSize32; def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "ror{q}\t$dst", - [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>; + [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>; } // Constraints = "$src = $dst", SchedRW let Uses = [CL], SchedRW = [WriteRotateCLLd, WriteRMW] in { @@ -634,18 +635,18 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), // Rotate by 1 def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), "ror{b}\t$dst", - [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>; + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), "ror{w}\t$dst", - [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>, + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, OpSize16; def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t$dst", - [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>, + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>, OpSize32; def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t$dst", - [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>, + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>, Requires<[In64BitMode]>; } // SchedRW @@ -807,13 +808,54 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem, } // Defs = [EFLAGS] +// Use the opposite rotate if allows us to use the rotate by 1 instruction. 
+def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>; +def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>; +def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>; +def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>; +def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>; +def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>; +def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>; +def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>; + +def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst), + (ROR8m1 addr:$dst)>; +def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst), + (ROR16m1 addr:$dst)>; +def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst), + (ROR32m1 addr:$dst)>; +def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst), + (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>; + +def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst), + (ROL8m1 addr:$dst)>; +def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst), + (ROL16m1 addr:$dst)>; +def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst), + (ROL32m1 addr:$dst)>; +def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst), + (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>; + // Sandy Bridge and newer Intel processors support faster rotates using // SHLD to avoid a partial flag update on the normal rotate instructions. -let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in { - def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), - (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>; - def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), - (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>; +// Use a pseudo so that TwoInstructionPass and register allocation will see +// this as unary instruction. +let Predicates = [HasFastSHLDRotate], AddedComplexity = 5, + Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteSHDrri], + Constraints = "$src1 = $dst" in { + def SHLDROT32ri : I<0, Pseudo, (outs GR32:$dst), + (ins GR32:$src1, u8imm:$shamt), "", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$shamt)))]>; + def SHLDROT64ri : I<0, Pseudo, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$shamt), "", + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$shamt)))]>; + + def SHRDROT32ri : I<0, Pseudo, (outs GR32:$dst), + (ins GR32:$src1, u8imm:$shamt), "", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$shamt)))]>; + def SHRDROT64ri : I<0, Pseudo, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$shamt), "", + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$shamt)))]>; } def ROT32L2R_imm8 : SDNodeXForm<imm, [{ @@ -871,19 +913,29 @@ let Predicates = [HasBMI2] in { // Prefer RORX which is non-destructive and doesn't update EFLAGS. 
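Both of the rewrites in this area lean on the same identity: the opposite-rotate patterns above turn a rotate-left by width-1 into the rotate-by-1 forms (which need no immediate byte), and the RORX patterns that follow express rotl through RORX by converting the immediate to width-k. A quick self-contained check of that identity:

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t x, unsigned k) {
  k &= 31;
  return k ? (x << k) | (x >> (32 - k)) : x;
}

static uint32_t rotr32(uint32_t x, unsigned k) {
  k &= 31;
  return k ? (x >> k) | (x << (32 - k)) : x;
}

int main() {
  const uint32_t x = 0x12345678u;
  // rotl by k equals rotr by (32 - k); in particular rotl by 31 can use the
  // ror-by-1 encoding and rotr by 31 the rol-by-1 encoding, as matched above.
  for (unsigned k = 1; k != 32; ++k)
    assert(rotl32(x, k) == rotr32(x, 32 - k));
  return 0;
}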
let AddedComplexity = 10 in { + def : Pat<(rotr GR32:$src, (i8 imm:$shamt)), + (RORX32ri GR32:$src, imm:$shamt)>; + def : Pat<(rotr GR64:$src, (i8 imm:$shamt)), + (RORX64ri GR64:$src, imm:$shamt)>; + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; } + def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)), + (RORX32mi addr:$src, imm:$shamt)>; + def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)), + (RORX64mi addr:$src, imm:$shamt)>; + def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)), (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>; def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)), (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>; // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not - // immedidate shift, i.e. the following code is considered better + // immediate shift, i.e. the following code is considered better // // mov %edi, %esi // shl $imm, %esi diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 35ee00b9e016..7050e1917494 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -1,9 +1,8 @@ //===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,10 +14,10 @@ let SchedRW = [WriteSystem] in { let Defs = [RAX, RDX] in - def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB; +def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", []>, TB; let Defs = [RAX, RCX, RDX] in - def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB; +def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; // CPU flow control instructions @@ -411,7 +410,7 @@ let Defs = [EAX, EDX], Uses = [ECX] in def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; let Defs = [RAX, RDX], Uses = [ECX] in - def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB; +def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", []>, OpSize16, TB; @@ -588,18 +587,13 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in //==-----------------------------------------------------------------------===// // PKU - enable protection key -let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - def WRPKRU : PseudoI<(outs), (ins GR32:$src), - [(int_x86_wrpkru GR32:$src)]>; - def RDPKRU : PseudoI<(outs GR32:$dst), (ins), - [(set GR32:$dst, (int_x86_rdpkru))]>; -} - let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in - def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", + [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB; let Uses = [EAX, ECX, EDX] in - def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", + [(X86wrpkru EAX, EDX, ECX)]>, TB; } // SchedRW 
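The rdpkru/wrpkru change above replaces the custom-inserter pseudo instructions with selection patterns on the real instructions, matching the X86rdpkru/X86wrpkru nodes directly. At the source level these are reached through the PKU intrinsics; a minimal usage sketch, assuming a compiler and target with PKU support (e.g. building with -mpku):

#include <immintrin.h>

// Read the protection-key rights register, deny all access for key 1
// (AD and WD bits), and write it back. _rdpkru_u32 and _wrpkru lower to the
// rdpkru/wrpkru instructions whose patterns are added above.
unsigned int deny_access_for_key1(void) {
  unsigned int pkru = _rdpkru_u32();
  pkru |= 0x3u << (2 * 1);
  _wrpkru(pkru);
  return pkru;
}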
//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td index 10c6eef78639..fc0da845299f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrTSX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td @@ -1,9 +1,8 @@ //===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td index 06a438ebfcad..37bc4ce2e053 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrVMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td @@ -1,9 +1,8 @@ //===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td index c417dc99b84d..e98843bd3ae3 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -1,9 +1,8 @@ //===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -99,76 +98,6 @@ defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>; defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>; -multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr, - RegisterClass RC, ValueType DstTy, - ValueType SrcTy, SubRegIndex SubIdx> { - def : Pat<(alignedstore (DstTy (extract_subvector - (SrcTy RC:$src), (iPTR 0))), addr:$dst), - (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst, - (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>; - - def : Pat<(store (DstTy (extract_subvector - (SrcTy RC:$src), (iPTR 0))), addr:$dst), - (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst, - (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>; -} - -let Predicates = [HasAVX, NoVLX] in { - defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>; - defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>; - defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>; - defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>; - defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>; - defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>; -} - -let Predicates = [HasVLX] in { - // Special patterns for storing subvector extracts of lower 128-bits - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64, - sub_xmm>; - defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32, - sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64, - v4i64, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32, - v8i32, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16, - v16i16, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8, - v32i8, sub_xmm>; - - // Special patterns for storing subvector extracts of lower 128-bits of 512. - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64, - sub_xmm>; - defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32, - sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64, - v8i64, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32, - v16i32, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16, - v32i16, sub_xmm>; - defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8, - v64i8, sub_xmm>; - - // Special patterns for storing subvector extracts of lower 256-bits of 512. 
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64, - sub_ymm>; - defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32, - sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64, - v8i64, sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32, - v16i32, sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16, - v32i16, sub_ymm>; - defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8, - v64i8, sub_ymm>; -} - // If we're inserting into an all zeros vector, just use a plain move which // will zero the upper bits. A post-isel hook will take care of removing // any moves that we can prove are unnecessary. @@ -176,7 +105,7 @@ multiclass subvec_zero_lowering<string MoveStr, RegisterClass RC, ValueType DstTy, ValueType SrcTy, ValueType ZeroTy, SubRegIndex SubIdx> { - def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + def : Pat<(DstTy (insert_subvector immAllZerosV, (SrcTy RC:$src), (iPTR 0))), (SUBREG_TO_REG (i64 0), (SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>; @@ -398,7 +327,7 @@ let Predicates = [HasBWI, HasDQI] in { (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>; } -let Predicates = [HasBWI, HasVLX] in { +let Predicates = [HasBWI] in { def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), (v1i1 VK1:$mask), (iPTR 0))), (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32), @@ -487,7 +416,7 @@ def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), (XORPSrr VR128:$src1, VR128:$src2)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2 def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))), (VANDPSrm VR128:$src1, f128mem:$src2)>; @@ -507,3 +436,24 @@ def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))), def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), (VXORPSrr VR128:$src1, VR128:$src2)>; } + +let Predicates = [HasVLX] in { +// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2 +def : Pat<(f128 (X86fand VR128X:$src1, (loadf128 addr:$src2))), + (VANDPSZ128rm VR128X:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86fand VR128X:$src1, VR128X:$src2)), + (VANDPSZ128rr VR128X:$src1, VR128X:$src2)>; + +def : Pat<(f128 (X86for VR128X:$src1, (loadf128 addr:$src2))), + (VORPSZ128rm VR128X:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86for VR128X:$src1, VR128X:$src2)), + (VORPSZ128rr VR128X:$src1, VR128X:$src2)>; + +def : Pat<(f128 (X86fxor VR128X:$src1, (loadf128 addr:$src2))), + (VXORPSZ128rm VR128X:$src1, f128mem:$src2)>; + +def : Pat<(f128 (X86fxor VR128X:$src1, VR128X:$src2)), + (VXORPSZ128rr VR128X:$src1, VR128X:$src2)>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index 9d810a675e3b..66ca78556b82 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -1,9 +1,8 @@ //===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -247,36 +246,22 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, let ExeDomain = SSEPackedInt in { // SSE integer instructions let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, XOPCC:$cc), - !strconcat("vpcom${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR128:$src1, VR128:$src2, u8imm:$cc), + !strconcat("vpcom", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), imm:$cc)))]>, XOP_4V, Sched<[sched]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), - !strconcat("vpcom${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR128:$src1, i128mem:$src2, u8imm:$cc), + !strconcat("vpcom", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)), imm:$cc)))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - !strconcat("vpcom", Suffix, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, Sched<[sched]>, NotMemoryFoldable; - let mayLoad = 1 in - def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - !strconcat("vpcom", Suffix, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>, - NotMemoryFoldable; - } } def : Pat<(OpNode (load addr:$src2), diff --git a/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp index c20336387b2d..892a083f4d1a 100644 --- a/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -1,9 +1,8 @@ //===- X86InstructionSelector.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -419,18 +418,22 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, if (X86::GPRRegBankID == RB.getID()) return Isload ? X86::MOV32rm : X86::MOV32mr; if (X86::VECRRegBankID == RB.getID()) - return Isload ? (HasAVX512 ? X86::VMOVSSZrm - : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) - : (HasAVX512 ? X86::VMOVSSZmr - : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + return Isload ? (HasAVX512 ? X86::VMOVSSZrm_alt : + HasAVX ? X86::VMOVSSrm_alt : + X86::MOVSSrm_alt) + : (HasAVX512 ? X86::VMOVSSZmr : + HasAVX ? X86::VMOVSSmr : + X86::MOVSSmr); } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) { if (X86::GPRRegBankID == RB.getID()) return Isload ? X86::MOV64rm : X86::MOV64mr; if (X86::VECRRegBankID == RB.getID()) - return Isload ? (HasAVX512 ? X86::VMOVSDZrm - : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) - : (HasAVX512 ? 
X86::VMOVSDZmr - : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + return Isload ? (HasAVX512 ? X86::VMOVSDZrm_alt : + HasAVX ? X86::VMOVSDrm_alt : + X86::MOVSDrm_alt) + : (HasAVX512 ? X86::VMOVSDZmr : + HasAVX ? X86::VMOVSDmr : + X86::MOVSDmr); } else if (Ty.isVector() && Ty.getSizeInBits() == 128) { if (Alignment >= 16) return Isload ? (HasVLX ? X86::VMOVAPSZ128rm @@ -513,10 +516,22 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, LLT Ty = MRI.getType(DefReg); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + assert(I.hasOneMemOperand()); auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { - LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); - return false; + if (MemOp.isAtomic()) { + // Note: for unordered operations, we rely on the fact the appropriate MMO + // is already on the instruction we're mutating, and thus we don't need to + // make any changes. So long as we select an opcode which is capable of + // loading or storing the appropriate size atomically, the rest of the + // backend is required to respect the MMO state. + if (!MemOp.isUnordered()) { + LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n"); + return false; + } + if (MemOp.getAlignment() < Ty.getSizeInBits()/8) { + LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n"); + return false; + } } unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment()); @@ -936,7 +951,6 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode( (CmpInst::Predicate)I.getOperand(1).getPredicate()); - unsigned OpSet = X86::getSETFromCond(CC); unsigned LHS = I.getOperand(2).getReg(); unsigned RHS = I.getOperand(3).getReg(); @@ -970,7 +984,7 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, .addReg(RHS); MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(OpSet), I.getOperand(0).getReg()); + TII.get(X86::SETCCr), I.getOperand(0).getReg()).addImm(CC); constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI); @@ -991,8 +1005,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
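The table that follows pairs two condition codes per predicate because ucomiss/ucomisd folds the unordered case into the flags: an unordered compare sets ZF and PF (and CF) to 1, while an ordered compare clears PF and sets ZF only on equality. "Ordered and equal" is therefore ZF set and PF clear (SETE and SETNP combined with AND8rr), and "unordered or not equal" is its complement (SETNE and SETP combined with OR8rr). A small C++ model of that flag logic, purely illustrative:

#include <cassert>
#include <cmath>
#include <utility>

struct Flags { bool ZF, PF; };

// ucomiss flag model (CF omitted): unordered sets ZF = PF = 1; ordered
// clears PF and sets ZF iff the operands compare equal.
static Flags ucomiss(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return {true, true};
  return {a == b, false};
}

int main() {
  const float qnan = std::nanf("");
  const std::pair<float, float> cases[] = {{1.0f, 1.0f}, {1.0f, 2.0f},
                                           {qnan, 1.0f}};
  for (const auto &c : cases) {
    Flags f = ucomiss(c.first, c.second);
    bool oeq = f.ZF && !f.PF;   // SETE  result ANDed with SETNP result
    bool une = !f.ZF || f.PF;   // SETNE result ORed  with SETP  result
    assert(oeq == (!std::isnan(c.first) && !std::isnan(c.second) &&
                   c.first == c.second));
    assert(une == !oeq);
  }
  return 0;
}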
static const uint16_t SETFOpcTable[2][3] = { - {X86::SETEr, X86::SETNPr, X86::AND8rr}, - {X86::SETNEr, X86::SETPr, X86::OR8rr}}; + {X86::COND_E, X86::COND_NP, X86::AND8rr}, + {X86::COND_NE, X86::COND_P, X86::OR8rr}}; const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: @@ -1032,9 +1046,9 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SETFOpc[0]), FlagReg1); + TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SETFOpc[1]), FlagReg2); + TII.get(X86::SETCCr), FlagReg2).addImm(SETFOpc[1]); MachineInstr &Set3 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SETFOpc[2]), ResultReg) .addReg(FlagReg1) @@ -1052,7 +1066,6 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, bool SwapArgs; std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) std::swap(LhsReg, RhsReg); @@ -1064,7 +1077,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, .addReg(RhsReg); MachineInstr &Set = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc), ResultReg); + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), ResultReg).addImm(CC); constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); constrainSelectedInstRegOperands(Set, TII, TRI, RBI); I.eraseFromParent(); @@ -1409,8 +1422,8 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I, *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri)) .addReg(CondReg) .addImm(1); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JNE_1)) - .addMBB(DestMBB); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JCC_1)) + .addMBB(DestMBB).addImm(X86::COND_NE); constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI); @@ -1530,15 +1543,14 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, const static struct ShiftEntry { unsigned SizeInBits; - unsigned CReg; unsigned OpLSHR; unsigned OpASHR; unsigned OpSHL; } OpTable[] = { - {8, X86::CL, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8 - {16, X86::CX, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16 - {32, X86::ECX, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32 - {64, X86::RCX, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64 + {8, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8 + {16, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16 + {32, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32 + {64, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64 }; if (DstRB.getID() != X86::GPRRegBankID) @@ -1551,7 +1563,6 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, if (ShiftEntryIt == std::end(OpTable)) return false; - unsigned CReg = ShiftEntryIt->CReg; unsigned Opcode = 0; switch (I.getOpcode()) { case TargetOpcode::G_SHL: @@ -1570,16 +1581,11 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, unsigned Op0Reg = I.getOperand(1).getReg(); unsigned Op1Reg = I.getOperand(2).getReg(); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), - ShiftEntryIt->CReg) - .addReg(Op1Reg); + assert(MRI.getType(Op1Reg).getSizeInBits() == 8); - // The shift instruction uses X86::CL. 
If we defined a super-register - // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. - if (CReg != X86::CL) - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::KILL), - X86::CL) - .addReg(CReg, RegState::Kill); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), + X86::CL) + .addReg(Op1Reg); MachineInstr &ShiftInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg) @@ -1608,8 +1614,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) && "Arguments and return value types must match"); - const RegisterBank &RegRB = *RBI.getRegBank(DstReg, MRI, TRI); - if (RegRB.getID() != X86::GPRRegBankID) + const RegisterBank *RegRB = RBI.getRegBank(DstReg, MRI, TRI); + if (!RegRB || RegRB->getID() != X86::GPRRegBankID) return false; const static unsigned NumTypes = 4; // i8, i16, i32, i64 @@ -1707,7 +1713,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, const DivRemEntry &TypeEntry = *OpEntryIt; const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; - const TargetRegisterClass *RegRC = getRegClass(RegTy, RegRB); + const TargetRegisterClass *RegRC = getRegClass(RegTy, *RegRB); if (!RBI.constrainGenericRegister(Op1Reg, *RegRC, MRI) || !RBI.constrainGenericRegister(Op2Reg, *RegRC, MRI) || !RBI.constrainGenericRegister(DstReg, *RegRC, MRI)) { diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 28940754a203..8f74a8fe041d 100644 --- a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -1,9 +1,8 @@ //===- X86InterleavedAccess.cpp -------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -194,7 +193,7 @@ void X86InterleavedAccessGroup::decompose( // Decompose the load instruction. LoadInst *LI = cast<LoadInst>(VecInst); - Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *VecBaseTy, *VecBasePtrTy; Value *VecBasePtr; unsigned int NumLoads = NumSubVectors; // In the case of stride 3 with a vector of 32 elements load the information @@ -202,18 +201,22 @@ void X86InterleavedAccessGroup::decompose( // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1] unsigned VecLength = DL.getTypeSizeInBits(VecWidth); if (VecLength == 768 || VecLength == 1536) { - Type *VecTran = - VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo(); - VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran); + VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16); + VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace()); + VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); NumLoads = NumSubVectors * (VecLength / 384); - } else + } else { + VecBaseTy = SubVecTy; + VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace()); VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); + } // Generate N loads of T type. 
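For context, the decomposition loop that follows splits the one wide interleaved load into NumLoads narrower loads through typed GEPs; after this patch the element type is carried explicitly in VecBaseTy and handed to CreateGEP and CreateAlignedLoad rather than being rederived from the pointer type. A condensed, self-contained restatement of that loop, assuming the LLVM 9-era IRBuilder signatures this file uses:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

// Emit NumLoads consecutive loads of VecBaseTy starting at VecBasePtr,
// collecting them in Out. Mirrors X86InterleavedAccessGroup::decompose.
static void decomposeLoads(llvm::IRBuilder<> &Builder, llvm::LoadInst *LI,
                           llvm::Type *VecBaseTy, llvm::Value *VecBasePtr,
                           unsigned NumLoads,
                           llvm::SmallVectorImpl<llvm::Instruction *> &Out) {
  for (unsigned i = 0; i < NumLoads; ++i) {
    llvm::Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Out.push_back(
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment()));
  }
}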
for (unsigned i = 0; i < NumLoads; i++) { // TODO: Support inbounds GEP. - Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); + Value *NewBasePtr = + Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = - Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); + Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment()); DecomposedVectors.push_back(NewLoad); } } @@ -416,7 +419,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4( } reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16), - NumOfElm, 4, Builder); + NumOfElm, 4, Builder); } // createShuffleStride returns shuffle mask of size N. diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index acb3d48463de..40141d894629 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1,9 +1,8 @@ //===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,21 +19,22 @@ namespace llvm { enum IntrinsicType : uint16_t { + CVTNEPS2BF16_MASK, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8, - CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, - CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, - INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, - INTR_TYPE_3OP_MASK, - IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, - INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK, + CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, + CVTPD2PS_MASK, + INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE, + INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE, + INTR_TYPE_1OP_MASK, INTR_TYPE_2OP_MASK, + IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_SAE, + INTR_TYPE_SCALAR_MASK_RND, + INTR_TYPE_3OP_SCALAR_MASK_SAE, COMPRESS_EXPAND_IN_REG, - TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2I_MASK, + TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, GATHER_AVX2, + FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2, ROUNDP, ROUNDS }; @@ -64,47 +64,47 @@ struct IntrinsicData { * the alphabetical order. 
*/ static const IntrinsicData IntrinsicsWithChain[] = { - X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0), - X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 
X86::VGATHERDPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), @@ -115,30 +115,30 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), - X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, 
GATHER, X86::VPGATHERQDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), @@ -249,47 +249,47 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNCUS, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), - 
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, 0, 0), - X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), - X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), 
- X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm), X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm, @@ -298,24 +298,24 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86::VSCATTERPF1QPDm), X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm), - X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), - X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), - X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0), X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0), X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), - X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), - X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), - X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0), + X86_INTRINSIC_DATA(rdtsc, RDTSC, X86::RDTSC, 0), + X86_INTRINSIC_DATA(rdtscp, RDTSC, X86::RDTSCP, 0), + 
X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0), X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), }; @@ -340,9 +340,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD), X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), + X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0), + X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), - X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,INTR_TYPE_1OP, X86ISD::VFPROUND, 0), X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx_cvt_ps2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), @@ -369,6 +371,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), @@ -413,31 +418,37 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE), X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), - X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_RND), - X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), - 
X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE), + X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_q_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_q_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_conflict_q_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_2OP, X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_2OP, X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_dbpsadbw_128, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_dbpsadbw_256, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), - X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE), X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), @@ -448,80 +459,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_kadd_d, INTR_TYPE_2OP, X86ISD::KADD, 0), X86_INTRINSIC_DATA(avx512_kadd_q, INTR_TYPE_2OP, X86ISD::KADD, 0), X86_INTRINSIC_DATA(avx512_kadd_w, INTR_TYPE_2OP, X86ISD::KADD, 0), - 
X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FADDS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FADDS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FADDS, X86ISD::FADDS_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FADDS, X86ISD::FADDS_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, - X86ISD::FSETCCM, X86ISD::FSETCCM_RND), + X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, - X86ISD::FSETCCM, X86ISD::FSETCCM_RND), + X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), - X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, + X86_INTRINSIC_DATA(avx512_mask_compress, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG, - X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, - X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er - X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, CVTPD2DQ_MASK, X86ISD::CVTP2SI, X86ISD::MCVTP2SI), X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, CVTPD2PS_MASK, X86ISD::VFPROUND, X86ISD::VMFPROUND), - 
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_RND_MASK, - ISD::FP_ROUND, X86ISD::VFPROUND_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, X86ISD::VFPROUND_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, CVTPD2DQ_MASK, X86ISD::CVTP2UI, X86ISD::MCVTP2UI), X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2UI, 0), @@ -539,8 +502,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, - ISD::FP_EXTEND, X86ISD::VFPEXT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK_SAE, + ISD::FP_EXTEND, X86ISD::VFPEXT_SAE), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, @@ -559,164 +522,116 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTSI2P, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VFPROUNDS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VFPEXTS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, CVTQQ2PS_MASK, + X86ISD::CVTSI2P, X86ISD::MCVTSI2P), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RND, + X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, CVTPD2DQ_MASK, X86ISD::CVTTP2SI, X86ISD::MCVTTP2SI), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2I_MASK, + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, CVTPD2DQ_MASK, X86ISD::CVTTP2UI, X86ISD::MCVTTP2UI), X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, 
INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UI, 0), - X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTUI2P, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), - X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FDIVS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FDIVS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 
0), - X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG, - X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG, + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, CVTQQ2PS_MASK, + X86ISD::CVTUI2P, X86ISD::MCVTUI2P), + X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512_mask_expand, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0), - X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), + X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMM, X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, - X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FGETEXPS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FGETEXPS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, 
INTR_TYPE_1OP_MASK_SAE, + X86ISD::FGETEXP, X86ISD::FGETEXP_SAE), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::FGETEXP, X86ISD::FGETEXP_SAE), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK, - X86ISD::VGETMANT, X86ISD::VGETMANT_RND), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, X86ISD::VGETMANT_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK, - X86ISD::VGETMANT, X86ISD::VGETMANT_RND), - X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK, - X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), - X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK, - X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMAXS, X86ISD::FMAXS_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMAXS, X86ISD::FMAXS_RND), - X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMINS, X86ISD::FMINS_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK, - X86ISD::FMINS, X86ISD::FMINS_RND), - X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMULS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMULS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, X86ISD::VGETMANT_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_SAE, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_SAE, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMAXS, X86ISD::FMAXS_SAE), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMAXS, X86ISD::FMAXS_SAE), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMINS, X86ISD::FMINS_SAE), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMINS, X86ISD::FMINS_SAE), + X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMULS, X86ISD::FMULS_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMULS, X86ISD::FMULS_RND), X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, TRUNCATE_TO_REG, @@ 
-737,10 +652,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNC, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), - X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), - X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, TRUNCATE_TO_REG, @@ -749,10 +660,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { ISD::TRUNCATE, X86ISD::VMTRUNC), X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, TRUNCATE_TO_REG, X86ISD::VTRUNC, X86ISD::VMTRUNC), - X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), - X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, - ISD::TRUNCATE, 0), X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, TRUNCATE_TO_REG, X86ISD::VTRUNCS, X86ISD::VMTRUNCS), X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, TRUNCATE_TO_REG, @@ -825,62 +732,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), - X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), - X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND), - X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND), - X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, 
INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_SAE, X86ISD::VRANGE, X86ISD::VRANGE_SAE), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK, - X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND), + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK, - X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND), - X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, X86ISD::SCALEF_RND), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM, - X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, - X86ISD::SCALEF, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::SCALEFS, 0), - X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::SCALEFS, 0), - 
X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSQRTS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSQRTS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSUBS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, X86ISD::SCALEF_RND), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::SCALEFS, X86ISD::SCALEFS_RND), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::SCALEFS, X86ISD::SCALEFS_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::FSQRTS, X86ISD::FSQRTS_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::FSQRTS, X86ISD::FSQRTS_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FSUBS, X86ISD::FSUBS_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FSUBS, X86ISD::FSUBS_RND), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK, X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK, @@ -893,28 +800,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ, - X86ISD::VFIXUPIMM, 0), + X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ, - X86ISD::VFIXUPIMM, 0), - X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ, - X86ISD::VFIXUPIMMS, 0), - X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ, - X86ISD::VFIXUPIMMS, 0), + X86ISD::VFIXUPIMM, X86ISD::VFIXUPIMM_SAE), + X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMM_MASKZ, + X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), + X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMM_MASKZ, + X86ISD::VFIXUPIMMS, X86ISD::VFIXUPIMMS_SAE), - X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP, X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_max_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), + X86_INTRINSIC_DATA(avx512_max_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), + X86_INTRINSIC_DATA(avx512_min_pd_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), + X86_INTRINSIC_DATA(avx512_min_ps_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mul_ps_512, 
INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -990,10 +899,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), @@ -1002,14 +911,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE), + X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), + 
X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND), @@ -1071,6 +982,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), + // bfloat16 + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), @@ -1111,6 +1032,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_cvtps2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvtsd2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), X86_INTRINSIC_DATA(sse2_cvtsd2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), + X86_INTRINSIC_DATA(sse2_cvtsd2ss, INTR_TYPE_2OP, X86ISD::VFPROUNDS, 0), X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvttsd2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), @@ -1123,6 +1045,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), @@ -1156,8 +1080,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0), + X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0), 
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0), X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0), @@ -1200,14 +1127,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP, X86ISD::GF2P8MULB, 0), - X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), - X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), - X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), - X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), - X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), diff --git a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp index 4a49fa68dd06..00fb1b573858 100644 --- a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -134,9 +133,15 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { // Shifts and SDIV getActionDefinitionsBuilder( - {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM}) - .legalFor({s8, s16, s32}) - .clampScalar(0, s8, s32); + {G_SDIV, G_SREM, G_UDIV, G_UREM}) + .legalFor({s8, s16, s32}) + .clampScalar(0, s8, s32); + + getActionDefinitionsBuilder( + {G_SHL, G_LSHR, G_ASHR}) + .legalFor({{s8, s8}, {s16, s8}, {s32, s8}}) + .clampScalar(0, s8, s32) + .clampScalar(1, s8, s8); } // Control-flow @@ -236,12 +241,19 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { .clampScalar(1, s32, s64) .widenScalarToNextPow2(1); - // Shifts and SDIV + // Divisions getActionDefinitionsBuilder( - {G_SHL, G_LSHR, G_ASHR, G_SDIV, G_SREM, G_UDIV, G_UREM}) + {G_SDIV, G_SREM, G_UDIV, G_UREM}) .legalFor({s8, s16, s32, s64}) .clampScalar(0, s8, s64); + // Shifts + getActionDefinitionsBuilder( + {G_SHL, G_LSHR, G_ASHR}) + .legalFor({{s8, s8}, {s16, s8}, {s32, s8}, {s64, s8}}) + .clampScalar(0, s8, s64) + .clampScalar(1, s8, s8); + // Merge/Unmerge setAction({G_MERGE_VALUES, s128}, Legal); setAction({G_UNMERGE_VALUES, 1, s128}, Legal); diff --git a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h index 135950a95f84..d21707b9ab9b 100644 --- a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h +++ b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h @@ -1,10 +1,9 @@ //===- X86LegalizerInfo.h ------------------------------------------*- C++ //-*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 2816f8c62bfb..b1fefaa84be4 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1,9 +1,8 @@ //===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,9 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86InstComments.h" +#include "MCTargetDesc/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86InstComments.h" #include "MCTargetDesc/X86TargetStreamer.h" #include "Utils/X86ShuffleDecode.h" #include "X86AsmPrinter.h" @@ -101,9 +100,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), - EnablePrintSchedInfo && - !(Inst.getFlags() & X86::NO_SCHED_INFO)); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -438,7 +435,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(MaybeMCOp.getValue()); // Handle a few special cases to eliminate operand modifiers. -ReSimplify: switch (OutMI.getOpcode()) { case X86::LEA64_32r: case X86::LEA64r: @@ -554,11 +550,6 @@ ReSimplify: case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode; - case X86::TAILJMPd_CC: - case X86::TAILJMPd64_CC: - Opcode = X86::GetCondBranchFromCond( - static_cast<X86::CondCode>(MI->getOperand(1).getImm())); - goto SetTailJmpOpcode; SetTailJmpOpcode: MCOperand Saved = OutMI.getOperand(0); @@ -568,6 +559,17 @@ ReSimplify: break; } + case X86::TAILJMPd_CC: + case X86::TAILJMPd64_CC: { + MCOperand Saved = OutMI.getOperand(0); + MCOperand Saved2 = OutMI.getOperand(1); + OutMI = MCInst(); + OutMI.setOpcode(X86::JCC_1); + OutMI.addOperand(Saved); + OutMI.addOperand(Saved2); + break; + } + case X86::DEC16r: case X86::DEC32r: case X86::INC16r: @@ -586,19 +588,6 @@ ReSimplify: } break; - // These are pseudo-ops for OR to help with the OR->ADD transformation. We do - // this with an ugly goto in case the resultant OR uses EAX and needs the - // short form. - case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; - case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; - case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; - case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; - case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; - case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; - case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify; - case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; - case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; - // We don't currently select the correct instruction form for instructions // which have a short %eax, etc. form. Handle this by custom lowering, for // now. 
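One detail worth spelling out about the TAILJMPd_CC / TAILJMPd64_CC block above: it can forward the two saved operands straight into a JCC_1 because, in this import, the condition code of the unified JCC_1 / SETCCr / CMOVcc opcodes is an explicit immediate operand rather than being baked into the opcode. A minimal sketch of reading that operand back is shown below; the backend's own helper, X86::getCondFromBranch (it appears later in the X86MacroFusion change), is assumed to do the equivalent.

#include "MCTargetDesc/X86BaseInfo.h"      // X86::CondCode, COND_INVALID
#include "MCTargetDesc/X86MCTargetDesc.h"  // X86::JCC_1 opcode enum (per this tree's layout)
#include "llvm/CodeGen/MachineInstr.h"

static llvm::X86::CondCode branchCondition(const llvm::MachineInstr &MI) {
  using namespace llvm;
  if (MI.getOpcode() != X86::JCC_1)
    return X86::COND_INVALID;
  // JCC_1 is (target, cond): the condition immediate is the last explicit operand.
  return static_cast<X86::CondCode>(
      MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
}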
@@ -694,16 +683,9 @@ ReSimplify: void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI) { - - bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 || MI.getOpcode() == X86::TLS_base_addr64; - - bool needsPadding = MI.getOpcode() == X86::TLS_addr64; - - MCContext &context = OutStreamer->getContext(); - - if (needsPadding) - EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + MCContext &Ctx = OutStreamer->getContext(); MCSymbolRefExpr::VariantKind SRVK; switch (MI.getOpcode()) { @@ -721,51 +703,86 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, llvm_unreachable("unexpected opcode"); } - MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); - const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context); - - MCInst LEA; - if (is64Bits) { - LEA.setOpcode(X86::LEA64r); - LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest - LEA.addOperand(MCOperand::createReg(X86::RIP)); // base - LEA.addOperand(MCOperand::createImm(1)); // scale - LEA.addOperand(MCOperand::createReg(0)); // index - LEA.addOperand(MCOperand::createExpr(symRef)); // disp - LEA.addOperand(MCOperand::createReg(0)); // seg - } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { - LEA.setOpcode(X86::LEA32r); - LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest - LEA.addOperand(MCOperand::createReg(X86::EBX)); // base - LEA.addOperand(MCOperand::createImm(1)); // scale - LEA.addOperand(MCOperand::createReg(0)); // index - LEA.addOperand(MCOperand::createExpr(symRef)); // disp - LEA.addOperand(MCOperand::createReg(0)); // seg + const MCSymbolRefExpr *Sym = MCSymbolRefExpr::create( + MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)), SRVK, Ctx); + + // As of binutils 2.32, ld has a bogus TLS relaxation error when the GD/LD + // code sequence using R_X86_64_GOTPCREL (instead of R_X86_64_GOTPCRELX) is + // attempted to be relaxed to IE/LE (binutils PR24784). Work around the bug by + // only using GOT when GOTPCRELX is enabled. + // TODO Delete the workaround when GOTPCRELX becomes commonplace. 
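The workaround comment above is the crux of this hunk: the GOT-indirect form of the __tls_get_addr call is only emitted when the integrated assembler is allowed to produce the relaxable R_X86_64_GOTPCRELX style relocation, since feeding plain R_X86_64_GOTPCREL into the affected binutils ld trips the bogus TLS relaxation error described there. A condensed sketch of the gate and of the two call shapes it chooses between; the predicate is exactly what the code below computes, while the commented assembly is a paraphrase of the 64-bit sequences built further down with MCInstBuilder.

#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"

// UseGot == true  -> call *__tls_get_addr@GOTPCREL(%rip)  (GOT-indirect; relaxable
//                    when the assembler emits R_X86_64_[REX_]GOTPCRELX)
// UseGot == false -> call __tls_get_addr@PLT              (classic GD/LD sequence)
static bool useGotForTlsGetAddr(const llvm::Module &M,
                                const llvm::MCAsmInfo &MAI) {
  // getRtLibUseGOT(): the module asked for runtime-library calls through the
  // GOT (the "RtLibUseGOT" module flag, e.g. from -fno-plt style builds).
  // canRelaxRelocations(): the assembler may use GOTPCRELX, so the linker sees
  // a relocation it knows how to relax and the binutils bug is not triggered.
  return M.getRtLibUseGOT() && MAI.canRelaxRelocations();
}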
+ bool UseGot = MMI->getModule()->getRtLibUseGOT() && + Ctx.getAsmInfo()->canRelaxRelocations(); + + if (Is64Bits) { + bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD; + if (NeedsPadding) + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::LEA64r) + .addReg(X86::RDI) + .addReg(X86::RIP) + .addImm(1) + .addReg(0) + .addExpr(Sym) + .addReg(0)); + const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("__tls_get_addr"); + if (NeedsPadding) { + if (!UseGot) + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + } + if (UseGot) { + const MCExpr *Expr = MCSymbolRefExpr::create( + TlsGetAddr, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64m) + .addReg(X86::RIP) + .addImm(1) + .addReg(0) + .addExpr(Expr) + .addReg(0)); + } else { + EmitAndCountInstruction( + MCInstBuilder(X86::CALL64pcrel32) + .addExpr(MCSymbolRefExpr::create(TlsGetAddr, + MCSymbolRefExpr::VK_PLT, Ctx))); + } } else { - LEA.setOpcode(X86::LEA32r); - LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest - LEA.addOperand(MCOperand::createReg(0)); // base - LEA.addOperand(MCOperand::createImm(1)); // scale - LEA.addOperand(MCOperand::createReg(X86::EBX)); // index - LEA.addOperand(MCOperand::createExpr(symRef)); // disp - LEA.addOperand(MCOperand::createReg(0)); // seg - } - EmitAndCountInstruction(LEA); + if (SRVK == MCSymbolRefExpr::VK_TLSGD && !UseGot) { + EmitAndCountInstruction(MCInstBuilder(X86::LEA32r) + .addReg(X86::EAX) + .addReg(0) + .addImm(1) + .addReg(X86::EBX) + .addExpr(Sym) + .addReg(0)); + } else { + EmitAndCountInstruction(MCInstBuilder(X86::LEA32r) + .addReg(X86::EAX) + .addReg(X86::EBX) + .addImm(1) + .addReg(0) + .addExpr(Sym) + .addReg(0)); + } - if (needsPadding) { - EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); - EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); - EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + const MCSymbol *TlsGetAddr = Ctx.getOrCreateSymbol("___tls_get_addr"); + if (UseGot) { + const MCExpr *Expr = + MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_GOT, Ctx); + EmitAndCountInstruction(MCInstBuilder(X86::CALL32m) + .addReg(X86::EBX) + .addImm(1) + .addReg(0) + .addExpr(Expr) + .addReg(0)); + } else { + EmitAndCountInstruction( + MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::create(TlsGetAddr, + MCSymbolRefExpr::VK_PLT, Ctx))); + } } - - StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; - MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name); - const MCSymbolRefExpr *tlsRef = - MCSymbolRefExpr::create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, context); - - EmitAndCountInstruction( - MCInstBuilder(is64Bits ? 
X86::CALL64pcrel32 : X86::CALLpcrel32) - .addExpr(tlsRef)); } /// Emit the largest nop instruction smaller than or equal to \p NumBytes @@ -778,7 +795,7 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; - Opc = IndexReg = Displacement = SegmentReg = 0; + IndexReg = Displacement = SegmentReg = 0; BaseReg = X86::RAX; ScaleVal = 1; switch (NumBytes) { @@ -963,6 +980,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I)) MI.addOperand(MaybeOperand.getValue()); + OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); OutStreamer->EmitInstruction(MI, getSubtargetInfo()); } @@ -1374,7 +1392,8 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { MBB = MBB->getPrevNode(); MBBI = MBB->end(); } - return --MBBI; + --MBBI; + return MBBI; } static const Constant *getConstantFromPool(const MachineInstr &MI, @@ -1668,6 +1687,77 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TLS_base_addr64: return LowerTlsAddr(MCInstLowering, *MI); + // Loading/storing mask pairs requires two kmov operations. The second one of these + // needs a 2 byte displacement relative to the specified address (with 32 bit spill + // size). The pairs of 1bit masks up to 16 bit masks all use the same spill size, + // they all are stored using MASKPAIR16STORE, loaded using MASKPAIR16LOAD. + // + // The displacement value might wrap around in theory, thus the asserts in both + // cases. + case X86::MASKPAIR16LOAD: { + int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm(); + assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); + const X86RegisterInfo *RI = + MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + + // Load the first mask register + MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm); + MIB.addReg(Reg0); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i)); + MIB.addOperand(Op.getValue()); + } + EmitAndCountInstruction(MIB); + + // Load the second mask register of the pair + MIB = MCInstBuilder(X86::KMOVWkm); + MIB.addReg(Reg1); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) { + MIB.addImm(Disp + 2); + } else { + auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i)); + MIB.addOperand(Op.getValue()); + } + } + EmitAndCountInstruction(MIB); + return; + } + + case X86::MASKPAIR16STORE: { + int64_t Disp = MI->getOperand(X86::AddrDisp).getImm(); + assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); + const X86RegisterInfo *RI = + MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg(); + unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + + // Store the first mask register + MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue()); + MIB.addReg(Reg0); + EmitAndCountInstruction(MIB); + + // Store the second mask register of the pair + MIB = MCInstBuilder(X86::KMOVWmk); + for (int i = 0; i < X86::AddrNumOperands; ++i) { + if (i == 
X86::AddrDisp) { + MIB.addImm(Disp + 2); + } else { + auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i)); + MIB.addOperand(Op.getValue()); + } + } + MIB.addReg(Reg1); + EmitAndCountInstruction(MIB); + return; + } + case X86::MOVPC32r: { // This is a pseudo op for a two instruction sequence with a label, which // looks like: @@ -1861,8 +1951,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 64> Mask; DecodePSHUFBMask(C, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -1934,8 +2023,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPERMILPMask(C, ElSize, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -1966,8 +2054,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } @@ -1984,8 +2071,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPPERMMask(C, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), - !EnablePrintSchedInfo); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } @@ -2002,7 +2088,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; if (auto *CF = dyn_cast<ConstantFP>(C)) { CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false); - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } } break; @@ -2099,7 +2185,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << "]"; - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } else if (auto *CV = dyn_cast<ConstantVector>(C)) { CS << "<"; for (int l = 0; l != NumLanes; ++l) { @@ -2111,7 +2197,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << ">"; - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } } break; @@ -2198,14 +2284,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { printConstant(C, CS); } CS << "]"; - OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + OutStreamer->AddComment(CS.str()); } } MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); - if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment)) - TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO); // Stackmap shadows cannot include branch targets, so we can count the bytes // in a call towards the shadow, but must ensure that the no thread returns diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index 5433033671f3..05f846bfb219 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// // -// The LLVM 
Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index e1183bd14796..d7e535598d81 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp b/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp index 5c09597d0442..c6da4b09dd60 100644 --- a/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -1,9 +1,8 @@ //===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,59 +18,29 @@ using namespace llvm; -/// Check if the instr pair, FirstMI and SecondMI, should be fused -/// together. Given SecondMI, when FirstMI is unspecified, then check if -/// SecondMI may be part of a fused pair at all. -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI); - // Check if this processor supports macro-fusion. - if (!ST.hasMacroFusion()) - return false; +namespace { - enum { - FuseTest, - FuseCmp, - FuseInc - } FuseKind; +// The classification for the first instruction. +enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid }; - unsigned FirstOpcode = FirstMI - ? FirstMI->getOpcode() - : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = SecondMI.getOpcode(); +// The classification for the second instruction (jump). +enum class JumpKind { + // JE, JL, JG and variants. + ELG, + // JA, JB and variants. + AB, + // JS, JP, JO and variants. + SPO, + // Not a fusable jump. 
+ Invalid, +}; - switch (SecondOpcode) { - default: - return false; - case X86::JE_1: - case X86::JNE_1: - case X86::JL_1: - case X86::JLE_1: - case X86::JG_1: - case X86::JGE_1: - FuseKind = FuseInc; - break; - case X86::JB_1: - case X86::JBE_1: - case X86::JA_1: - case X86::JAE_1: - FuseKind = FuseCmp; - break; - case X86::JS_1: - case X86::JNS_1: - case X86::JP_1: - case X86::JNP_1: - case X86::JO_1: - case X86::JNO_1: - FuseKind = FuseTest; - break; - } +} // namespace - switch (FirstOpcode) { +static FirstInstrKind classifyFirst(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: - return false; + return FirstInstrKind::Invalid; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: @@ -84,6 +53,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::TEST16mr: case X86::TEST32mr: case X86::TEST64mr: + return FirstInstrKind::Test; case X86::AND16ri: case X86::AND16ri8: case X86::AND16rm: @@ -99,7 +69,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::AND8ri: case X86::AND8rm: case X86::AND8rr: - return true; + return FirstInstrKind::And; case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP16rm: @@ -119,6 +89,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::CMP8rm: case X86::CMP8rr: case X86::CMP8mr: + return FirstInstrKind::Cmp; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri8_DB: @@ -141,8 +112,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD8ri: + case X86::ADD8ri_DB: case X86::ADD8rm: case X86::ADD8rr: + case X86::ADD8rr_DB: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB16rm: @@ -158,7 +131,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::SUB8ri: case X86::SUB8rm: case X86::SUB8rr: - return FuseKind == FuseCmp || FuseKind == FuseInc; + return FirstInstrKind::ALU; case X86::INC16r: case X86::INC32r: case X86::INC64r: @@ -167,10 +140,87 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::DEC32r: case X86::DEC64r: case X86::DEC8r: - return FuseKind == FuseInc; - case X86::INSTRUCTION_LIST_END: - return true; + return FirstInstrKind::IncDec; + } +} + +static JumpKind classifySecond(const MachineInstr &MI) { + X86::CondCode CC = X86::getCondFromBranch(MI); + if (CC == X86::COND_INVALID) + return JumpKind::Invalid; + + switch (CC) { + default: + return JumpKind::Invalid; + case X86::COND_E: + case X86::COND_NE: + case X86::COND_L: + case X86::COND_LE: + case X86::COND_G: + case X86::COND_GE: + return JumpKind::ELG; + case X86::COND_B: + case X86::COND_BE: + case X86::COND_A: + case X86::COND_AE: + return JumpKind::AB; + case X86::COND_S: + case X86::COND_NS: + case X86::COND_P: + case X86::COND_NP: + case X86::COND_O: + case X86::COND_NO: + return JumpKind::SPO; + } +} + +/// Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI); + + // Check if this processor supports any kind of fusion. 
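For context on where this reworked predicate ends up: shouldScheduleAdjacent is not called by the scheduler directly, it is wrapped by createX86MacroFusionDAGMutation() from X86MacroFusion.h (unchanged by this commit) and installed as a scheduling DAG mutation. A minimal sketch of that wiring, assuming the usual X86TargetMachine setup rather than anything introduced here:

#include "X86MacroFusion.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;

// Sketch only: roughly how the target is assumed to install the mutation so
// that fusable CMP/TEST + Jcc pairs stay adjacent in the pre-RA schedule.
static ScheduleDAGInstrs *buildX86PreRASchedDAG(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);   // default live DAG
  DAG->addMutation(createX86MacroFusionDAGMutation());  // consults shouldScheduleAdjacent
  return DAG;
}

With the checks that follow, cores reporting only hasBranchFusion() still get the mutation, but the classification restricts the first instruction of a fused pair to CMP or TEST.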
+ if (!(ST.hasBranchFusion() || ST.hasMacroFusion())) + return false; + + const JumpKind BranchKind = classifySecond(SecondMI); + + if (BranchKind == JumpKind::Invalid) + return false; // Second cannot be fused with anything. + + if (FirstMI == nullptr) + return true; // We're only checking whether Second can be fused at all. + + const FirstInstrKind TestKind = classifyFirst(*FirstMI); + + if (ST.hasBranchFusion()) { + // Branch fusion can merge CMP and TEST with all conditional jumps. + return (TestKind == FirstInstrKind::Cmp || + TestKind == FirstInstrKind::Test); + } + + if (ST.hasMacroFusion()) { + // Macro Fusion rules are a bit more complex. See Agner Fog's + // Microarchitecture table 9.2 "Instruction Fusion". + switch (TestKind) { + case FirstInstrKind::Test: + case FirstInstrKind::And: + return true; + case FirstInstrKind::Cmp: + case FirstInstrKind::ALU: + return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB; + case FirstInstrKind::IncDec: + return BranchKind == JumpKind::ELG; + case FirstInstrKind::Invalid: + return false; + } } + + llvm_unreachable("unknown branch fusion type"); } namespace llvm { diff --git a/contrib/llvm/lib/Target/X86/X86MacroFusion.h b/contrib/llvm/lib/Target/X86/X86MacroFusion.h index 97ef1d6d3b61..d4ae54f657a5 100644 --- a/contrib/llvm/lib/Target/X86/X86MacroFusion.h +++ b/contrib/llvm/lib/Target/X86/X86MacroFusion.h @@ -1,9 +1,8 @@ //===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index b56d02b6bfb6..7f75598b0655 100644 --- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -1,9 +1,8 @@ //===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -569,11 +568,8 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, unsigned VReg, int64_t AddrDispShift) { DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression()); - if (AddrDispShift != 0) - Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift, - DIExpression::NoDeref, - DIExpression::WithStackValue); + Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift); // Replace DBG_VALUE instruction with modified version. MachineBasicBlock *MBB = MI.getParent(); @@ -701,7 +697,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { // Remove redundant address calculations. Do it only for -Os/-Oz since only // a code size gain is expected from this part of the pass. 
- if (MF.getFunction().optForSize()) + if (MF.getFunction().hasOptSize()) Changed |= removeRedundantAddrCalc(LEAs); } diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 85b9aecc2106..af974c805c36 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -1,9 +1,8 @@ //===-------- X86PadShortFunction.cpp - pad short functions -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -98,7 +97,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - if (MF.getFunction().optForSize()) + if (MF.getFunction().hasOptSize()) return false; if (!MF.getSubtarget<X86Subtarget>().padShortFunctions()) @@ -113,14 +112,11 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; - MachineBasicBlock *MBB; - unsigned int Cycles = 0; - // Pad the identified basic blocks with NOOPs for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); I != ReturnBBs.end(); ++I) { - MBB = I->first; - Cycles = I->second; + MachineBasicBlock *MBB = I->first; + unsigned Cycles = I->second; if (Cycles < Threshold) { // BB ends in a return. Skip over any DBG_VALUE instructions diff --git a/contrib/llvm/lib/Target/X86/X86PfmCounters.td b/contrib/llvm/lib/Target/X86/X86PfmCounters.td index a1a4210b5ebf..5610f4bc8873 100644 --- a/contrib/llvm/lib/Target/X86/X86PfmCounters.td +++ b/contrib/llvm/lib/Target/X86/X86PfmCounters.td @@ -1,9 +1,8 @@ //===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp index 355291916ee8..78fede3dcde2 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp @@ -1,9 +1,8 @@ //===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -160,7 +159,7 @@ const RegisterBankInfo::InstructionMapping & X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - auto Opc = MI.getOpcode(); + unsigned Opc = MI.getOpcode(); // Try the default logic for non-generic instructions that are either copies // or already have some operands assigned to banks. @@ -174,17 +173,22 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: case TargetOpcode::G_MUL: - case TargetOpcode::G_SHL: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_ASHR: return getSameOperandsMapping(MI, false); - break; case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: return getSameOperandsMapping(MI, true); - break; + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: { + unsigned NumOperands = MI.getNumOperands(); + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + + auto Mapping = getValueMapping(getPartialMappingIdx(Ty, false), 3); + return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands); + + } default: break; } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h index e227880427f3..c1f3001c6180 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h @@ -1,9 +1,8 @@ //===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBanks.td b/contrib/llvm/lib/Target/X86/X86RegisterBanks.td index 6d17cd53a0c1..74c515850ab1 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterBanks.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterBanks.td @@ -1,9 +1,8 @@ //=- X86RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index bc39cee34c4a..2e2f1f9e438a 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -164,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::RFP32RegClassID: case X86::RFP64RegClassID: case X86::RFP80RegClassID: + case X86::VR512_0_15RegClassID: case X86::VR512RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. @@ -216,6 +216,21 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, } } +bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const { + // Prevent rewriting a copy where the destination size is larger than the + // input size. See PR41619. + // FIXME: Should this be factored into the base implementation somehow. + if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 && + SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit) + return false; + + return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, + SrcRC, SrcSubReg); +} + const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function &F = MF.getFunction(); @@ -750,7 +765,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } -unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const X86FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? FramePtr : StackPtr; } @@ -763,3 +778,12 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; } + +unsigned +X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + unsigned StackReg = getStackRegister(); + if (Subtarget.isTarget64BitILP32()) + StackReg = getX86SubSuperRegister(StackReg, 32); + return StackReg; +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 29401dadead0..b82920898069 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -1,9 +1,8 @@ //===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -50,7 +49,7 @@ private: unsigned BasePtr; public: - X86RegisterInfo(const Triple &TT); + explicit X86RegisterInfo(const Triple &TT); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; @@ -75,6 +74,11 @@ public: getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override; + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const override; + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. const TargetRegisterClass * @@ -129,15 +133,16 @@ public: RegScavenger *RS = nullptr) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; - unsigned getStackRegister() const { return StackPtr; } - unsigned getBaseRegister() const { return BasePtr; } + unsigned getPtrSizedStackRegister(const MachineFunction &MF) const; + Register getStackRegister() const { return StackPtr; } + Register getBaseRegister() const { return BasePtr; } /// Returns physical register used as frame pointer. /// This will always returns the frame pointer register, contrary to /// getFrameRegister() which returns the "base pointer" in situations /// involving a stack, frame and base pointer. - unsigned getFramePtr() const { return FramePtr; } + Register getFramePtr() const { return FramePtr; } // FIXME: Move to FrameInfok unsigned getSlotSize() const { return SlotSize; } }; diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index 6a0538138528..0528b90c1fd5 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -1,9 +1,8 @@ //===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,6 +28,8 @@ let Namespace = "X86" in { def sub_32bit : SubRegIndex<32>; def sub_xmm : SubRegIndex<128>; def sub_ymm : SubRegIndex<256>; + def sub_mask_0 : SubRegIndex<-1>; + def sub_mask_1 : SubRegIndex<-1, -1>; } //===----------------------------------------------------------------------===// @@ -553,17 +554,6 @@ def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128 def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; -// Special classes that help the assembly parser choose some alternate -// instructions to favor 2-byte VEX encodings. 
-def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], - 128, (sequence "XMM%u", 0, 7)>; -def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], - 128, (sequence "XMM%u", 8, 15)>; -def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], - 256, (sequence "YMM%u", 0, 7)>; -def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], - 256, (sequence "YMM%u", 8, 15)>; - // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { let CopyCost = -1; // Don't allow copying of status registers. @@ -582,6 +572,10 @@ def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> { def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; +// Represents the lower 16 registers that have VEX/legacy encodable subregs. +def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 15)>; + // Scalar AVX-512 floating point registers. def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; @@ -602,6 +596,16 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} +// Mask register pairs +def KPAIRS : RegisterTuples<[sub_mask_0, sub_mask_1], + [(add K0, K2, K4, K6), (add K1, K3, K5, K7)]>; + +def VK1PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK2PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK4PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK8PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} +def VK16PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} + def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} diff --git a/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp index 08994cccb21e..b435b22e8ac7 100644 --- a/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp +++ b/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp @@ -1,9 +1,8 @@ //======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file diff --git a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td index 971a50196e45..7574e4b8f896 100755 --- a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -1,9 +1,8 @@ //=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -82,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 6>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -159,7 +160,6 @@ defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>; def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads. defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move. -defm : BWWriteResPair<WriteCMOV2, [BWPort06,BWPort0156], 2, [1,1], 2>; // // Conditional (CF + ZF flag) move. defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc. @@ -186,7 +186,7 @@ defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>; // Integer shifts and rotates. defm : BWWriteResPair<WriteShift, [BWPort06], 1>; defm : BWWriteResPair<WriteShiftCL, [BWPort06,BWPort0156], 3, [2,1], 3>; -defm : BWWriteResPair<WriteRotate, [BWPort06], 2, [2], 2>; +defm : BWWriteResPair<WriteRotate, [BWPort06], 1, [1], 1>; defm : BWWriteResPair<WriteRotateCL, [BWPort06,BWPort0156], 3, [2,1], 3>; // SHLD/SHRD. @@ -732,10 +732,10 @@ def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> { } def: InstRW<[BWWriteResGroup20], (instrs CWD, JCXZ, JECXZ, JRCXZ, - ADC8i8, SBB8i8)>; -def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri", - "SBB8ri", - "SET(A|BE)r")>; + ADC8i8, SBB8i8, + ADC16i16, SBB16i16, + ADC32i32, SBB32i32, + ADC64i32, SBB64i32)>; def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> { let Latency = 2; @@ -814,7 +814,6 @@ def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> { let ResourceCycles = [1,1,1,1]; } def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>; -def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>; def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> { let Latency = 4; @@ -890,8 +889,7 @@ def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr", - "MUL_(FPrST0|FST0r|FrST0)")>; +def: InstRW<[BWWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>; def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> { let Latency = 5; @@ -965,6 +963,7 @@ def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> { } def: InstRW<[BWWriteResGroup59], (instrs CVTPS2PDrm, VCVTPS2PDrm, CVTSS2SDrm, VCVTSS2SDrm, + CVTSS2SDrm_Int, VCVTSS2SDrm_Int, VPSLLVQrm, VPSRLVQrm)>; @@ -1103,6 +1102,14 @@ def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> { def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def BWWriteResGroup87_1 : SchedWriteRes<[BWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup87_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { let Latency = 7; let NumMicroOps = 5; @@ -1592,4 +1599,140 @@ def: InstRW<[BWWriteResGroup202], (instrs 
FSTENVm)>; def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Haswell and Broadwell Pipeline" > "Register allocation and +// renaming". +// These can be investigated with llvm-exegesis, e.g. +// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def BWWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def BWWriteZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteALU]> +]>; +def : InstRW<[BWWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def BWWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogic]> +]>; +def : InstRW<[BWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, + VXORPDrr)>; + +def BWWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogicY]> +]>; +def : InstRW<[BWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + +def BWWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicX]> +]>; +def : InstRW<[BWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>; + +def BWWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicY]> +]>; +def : InstRW<[BWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>; + +def BWWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUX]> +]>; +def : InstRW<[BWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def BWWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUY]> +]>; +def : InstRW<[BWWriteVZeroIdiomALUY], (instrs VPSUBBYrr, + VPSUBDYrr, + VPSUBQYrr, + VPSUBWYrr, + VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def BWWritePCMPGTQ : SchedWriteRes<[BWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def BWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [BWWriteZeroLatency]>, + SchedVar<NoSchedPred, [BWWritePCMPGTQ]> +]>; +def : InstRW<[BWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// CMOVs that use both Z and C flag require an extra uop. 
+def BWWriteCMOVA_CMOVBErr : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 2; + let ResourceCycles = [1,1]; + let NumMicroOps = 2; +} + +def BWWriteCMOVA_CMOVBErm : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { + let Latency = 7; + let ResourceCycles = [1,1,1]; + let NumMicroOps = 3; +} + +def BWCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [BWWriteCMOVA_CMOVBErr]>, + SchedVar<NoSchedPred, [WriteCMOV]> +]>; + +def BWCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [BWWriteCMOVA_CMOVBErm]>, + SchedVar<NoSchedPred, [WriteCMOV.Folded]> +]>; + +def : InstRW<[BWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[BWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def BWWriteSETA_SETBEr : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 2; + let ResourceCycles = [1,1]; + let NumMicroOps = 2; +} + +def BWWriteSETA_SETBEm : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> { + let Latency = 3; + let ResourceCycles = [1,1,1,1]; + let NumMicroOps = 4; +} + +def BWSETA_SETBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [BWWriteSETA_SETBEr]>, + SchedVar<NoSchedPred, [WriteSETCC]> +]>; + +def BWSETA_SETBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [BWWriteSETA_SETBEm]>, + SchedVar<NoSchedPred, [WriteSETCCStore]> +]>; + +def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td index 06a32fb0b1cd..284d1567c5c6 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -1,9 +1,8 @@ //=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -87,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -151,7 +152,7 @@ defm : X86WriteRes<WriteXCHG, [HWPort0156], 2, [3], 3>; // Integer shifts and rotates. defm : HWWriteResPair<WriteShift, [HWPort06], 1>; defm : HWWriteResPair<WriteShiftCL, [HWPort06, HWPort0156], 3, [2,1], 3>; -defm : HWWriteResPair<WriteRotate, [HWPort06], 2, [2], 2>; +defm : HWWriteResPair<WriteRotate, [HWPort06], 1, [1], 1>; defm : HWWriteResPair<WriteRotateCL, [HWPort06, HWPort0156], 3, [2,1], 3>; // SHLD/SHRD. @@ -164,7 +165,6 @@ defm : HWWriteResPair<WriteJump, [HWPort06], 1>; defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>; defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move. -defm : HWWriteResPair<WriteCMOV2, [HWPort06,HWPort0156], 3, [1,2], 3>; // Conditional (CF + ZF flag) move. 
defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc. def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> { @@ -1126,7 +1126,6 @@ def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> { let ResourceCycles = [1,1]; } def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>; -def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>; def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> { let Latency = 7; @@ -1172,7 +1171,6 @@ def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> { let ResourceCycles = [1,1,1,1]; } def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>; -def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>; def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { let Latency = 8; @@ -1182,6 +1180,14 @@ def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def HWWriteResGroup46_1 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[HWWriteResGroup46_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { let Latency = 8; let NumMicroOps = 5; @@ -1391,8 +1397,8 @@ def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { let ResourceCycles = [1,1,1]; } def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm, - CVTSD2SSrm, - VCVTSD2SSrm)>; + CVTSD2SSrm, CVTSD2SSrm_Int, + VCVTSD2SSrm, VCVTSD2SSrm_Int)>; def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> { let Latency = 9; @@ -1442,8 +1448,7 @@ def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr", - "MUL_(FPrST0|FST0r|FrST0)")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_(FPrST0|FST0r|FrST0)")>; def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> { let Latency = 11; @@ -1847,4 +1852,170 @@ def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm, def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Haswell and Broadwell Pipeline" > "Register allocation and +// renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def HWWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def HWWriteZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteALU]> +]>; +def : InstRW<[HWWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def HWWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogic]> +]>; +def : InstRW<[HWWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, + VXORPDrr)>; + +def HWWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogicY]> +]>; +def : InstRW<[HWWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + +def HWWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicX]> +]>; +def : InstRW<[HWWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>; + +def HWWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicY]> +]>; +def : InstRW<[HWWriteVZeroIdiomLogicY], (instrs VPXORYrr)>; + +def HWWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUX]> +]>; +def : InstRW<[HWWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def HWWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUY]> +]>; +def : InstRW<[HWWriteVZeroIdiomALUY], (instrs VPSUBBYrr, + VPSUBDYrr, + VPSUBQYrr, + VPSUBWYrr, + VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def HWWritePCMPGTQ : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def HWWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [HWWriteZeroLatency]>, + SchedVar<NoSchedPred, [HWWritePCMPGTQ]> +]>; +def : InstRW<[HWWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// The 0x83 ADC/SBB opcodes have special support for immediate 0 to only require +// a single uop. It does not apply to the GR8 encoding. And only applies to the +// 8-bit immediate since using larger immediate for 0 would be silly. +// Unfortunately, this optimization does not apply to the AX/EAX/RAX short +// encodings we convert to in MCInstLowering so we exclude AX/EAX/RAX here since +// we schedule before that point. +// TODO: Should we disable using the short encodings on these CPUs? +def HWFastADC0 : MCSchedPredicate< + CheckAll<[ + CheckImmOperand<2, 0>, // Second MCOperand is Imm and has value 0. 
+ CheckNot<CheckRegOperand<1, AX>>, // First MCOperand is not register AX + CheckNot<CheckRegOperand<1, EAX>>, // First MCOperand is not register EAX + CheckNot<CheckRegOperand<1, RAX>> // First MCOperand is not register RAX + ]> +>; + +def HWWriteADC0 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def HWWriteADC : SchedWriteVariant<[ + SchedVar<HWFastADC0, [HWWriteADC0]>, + SchedVar<NoSchedPred, [WriteADC]> +]>; + +def : InstRW<[HWWriteADC], (instrs ADC16ri8, ADC32ri8, ADC64ri8, + SBB16ri8, SBB32ri8, SBB64ri8)>; + +// CMOVs that use both Z and C flag require an extra uop. +def HWWriteCMOVA_CMOVBErr : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 3; + let ResourceCycles = [1,2]; + let NumMicroOps = 3; +} + +def HWWriteCMOVA_CMOVBErm : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { + let Latency = 8; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def HWCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [HWWriteCMOVA_CMOVBErr]>, + SchedVar<NoSchedPred, [WriteCMOV]> +]>; + +def HWCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [HWWriteCMOVA_CMOVBErm]>, + SchedVar<NoSchedPred, [WriteCMOV.Folded]> +]>; + +def : InstRW<[HWCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[HWCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def HWWriteSETA_SETBEr : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 2; + let ResourceCycles = [1,1]; + let NumMicroOps = 2; +} + +def HWWriteSETA_SETBEm : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> { + let Latency = 3; + let ResourceCycles = [1,1,1,1]; + let NumMicroOps = 4; +} + +def HWSETA_SETBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [HWWriteSETA_SETBEr]>, + SchedVar<NoSchedPred, [WriteSETCC]> +]>; + +def HWSETA_SETBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [HWWriteSETA_SETBEm]>, + SchedVar<NoSchedPred, [WriteSETCCStore]> +]>; + +def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedPredicates.td b/contrib/llvm/lib/Target/X86/X86SchedPredicates.td index 1c7f24375f61..41bd776648f7 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedPredicates.td +++ b/contrib/llvm/lib/Target/X86/X86SchedPredicates.td @@ -1,9 +1,8 @@ //===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -61,3 +60,27 @@ def IsThreeOperandsLEABody : // X86GenInstrInfo. def IsThreeOperandsLEAFn : TIIPredicate<"isThreeOperandsLEA", IsThreeOperandsLEABody>; + +// A predicate to check for COND_A and COND_BE CMOVs which have an extra uop +// on recent Intel CPUs. 
+def IsCMOVArr_Or_CMOVBErr : CheckAny<[ + CheckImmOperand_s<3, "X86::COND_A">, + CheckImmOperand_s<3, "X86::COND_BE"> +]>; + +def IsCMOVArm_Or_CMOVBErm : CheckAny<[ + CheckImmOperand_s<7, "X86::COND_A">, + CheckImmOperand_s<7, "X86::COND_BE"> +]>; + +// A predicate to check for COND_A and COND_BE SETCCs which have an extra uop +// on recent Intel CPUs. +def IsSETAr_Or_SETBEr : CheckAny<[ + CheckImmOperand_s<1, "X86::COND_A">, + CheckImmOperand_s<1, "X86::COND_BE"> +]>; + +def IsSETAm_Or_SETBEm : CheckAny<[ + CheckImmOperand_s<5, "X86::COND_A">, + CheckImmOperand_s<5, "X86::COND_BE"> +]>; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td index 9dbf0976989f..d40bdf728a48 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -1,9 +1,8 @@ //=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -77,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -159,7 +160,6 @@ defm : SBWriteResPair<WriteJump, [SBPort5], 1>; defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>; defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move. -defm : SBWriteResPair<WriteCMOV2, [SBPort05,SBPort015], 3, [2,1], 3>; // Conditional (CF + ZF flag) move. defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move. def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc. 
def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> { @@ -615,13 +615,6 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr, MMX_PSIGNDrr, MMX_PSIGNWrr)>; -def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SBWriteResGroup9], (instregex "SET(A|BE)r")>; - def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> { let Latency = 2; let NumMicroOps = 2; @@ -705,12 +698,6 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> { } def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>; -def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { - let Latency = 5; - let NumMicroOps = 1; - let ResourceCycles = [1]; -} - def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> { let Latency = 5; let NumMicroOps = 1; @@ -772,13 +759,6 @@ def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> { } def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>; -def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { - let Latency = 3; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>; - def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> { let Latency = 5; let NumMicroOps = 4; @@ -1148,6 +1128,12 @@ def SBWriteFZeroIdiom : SchedWriteVariant<[ def : InstRW<[SBWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr)>; +def SBWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogicY]> +]>; +def : InstRW<[SBWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + def SBWriteVZeroIdiomLogicX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>, SchedVar<NoSchedPred, [WriteVecLogicX]> @@ -1166,10 +1152,68 @@ def : InstRW<[SBWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, PCMPGTDrr, VPCMPGTDrr, PCMPGTWrr, VPCMPGTWrr)>; +def SBWritePCMPGTQ : SchedWriteRes<[SBPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + def SBWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SBWriteZeroLatency]>, - SchedVar<NoSchedPred, [SBWriteResGroup30]> + SchedVar<NoSchedPred, [SBWritePCMPGTQ]> ]>; def : InstRW<[SBWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr)>; +// CMOVs that use both Z and C flag require an extra uop. +def SBWriteCMOVA_CMOVBErr : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 3; + let ResourceCycles = [2,1]; + let NumMicroOps = 3; +} + +def SBWriteCMOVA_CMOVBErm : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> { + let Latency = 8; + let ResourceCycles = [1,2,1]; + let NumMicroOps = 4; +} + +def SBCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SBWriteCMOVA_CMOVBErr]>, + SchedVar<NoSchedPred, [WriteCMOV]> +]>; + +def SBCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SBWriteCMOVA_CMOVBErm]>, + SchedVar<NoSchedPred, [WriteCMOV.Folded]> +]>; + +def : InstRW<[SBCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[SBCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. 
+def SBWriteSETA_SETBEr : SchedWriteRes<[SBPort05]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SBWriteSETA_SETBEm : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 3; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def SBSETA_SETBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SBWriteSETA_SETBEr]>, + SchedVar<NoSchedPred, [WriteSETCC]> +]>; + +def SBSETA_SETBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SBWriteSETA_SETBEm]>, + SchedVar<NoSchedPred, [WriteSETCCStore]> +]>; + +def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 2c9eb7516085..8f3e4ae62d53 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -1,9 +1,8 @@ //=- X86SchedSkylake.td - X86 Skylake Client Scheduling ------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -81,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -157,7 +158,6 @@ defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>; def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads. defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move. -defm : SKLWriteResPair<WriteCMOV2, [SKLPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move. defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc. def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> { @@ -183,7 +183,7 @@ defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>; // Integer shifts and rotates. defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>; defm : SKLWriteResPair<WriteShiftCL, [SKLPort06], 3, [3], 3>; -defm : SKLWriteResPair<WriteRotate, [SKLPort06], 2, [2], 2>; +defm : SKLWriteResPair<WriteRotate, [SKLPort06], 1, [1], 1>; defm : SKLWriteResPair<WriteRotateCL, [SKLPort06], 3, [3], 3>; // SHLD/SHRD. 
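A note on the recurring SETA/SETBE and CMOVA/CMOVBE overrides in this patch (the SandyBridge ones above, and the matching Haswell, Broadwell and Skylake ones): COND_A and COND_BE are singled out because they are the only x86 condition codes that combine the carry flag with the zero flag (no other condition uses CF together with another flag), and the patch's own comments note that it is exactly this combination that costs the extra uop the new SchedWriteVariants model. For reference, the flag logic is spelled out below; these comment lines are an editorial aid, not part of the patch:

// COND_A  (SETA, CMOVA):   taken when CF == 0 && ZF == 0
// COND_BE (SETBE, CMOVBE): taken when CF == 1 || ZF == 1
// Conditions that read a single flag or only the ZF/SF/OF group, such as
// COND_E (ZF == 1) or COND_B (CF == 1), keep the one-uop WriteSETCC and
// WriteCMOV defaults.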
@@ -659,8 +659,7 @@ def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr", - "VPBLENDD(Y?)rri", - "(V?)PSUB(B|D|Q|W)(Y?)rr")>; + "VPBLENDD(Y?)rri")>; def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> { let Latency = 1; @@ -698,13 +697,6 @@ def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> { def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP, MMX_MOVDQ2Qrr)>; -def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKLWriteResGroup15], (instregex "SET(A|BE)r")>; - def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -735,9 +727,10 @@ def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> { } def: InstRW<[SKLWriteResGroup23], (instrs CWD, JCXZ, JECXZ, JRCXZ, - ADC8i8, SBB8i8)>; -def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri", - "SBB8ri")>; + ADC8i8, SBB8i8, + ADC16i16, SBB16i16, + ADC32i32, SBB32i32, + ADC64i32, SBB64i32)>; def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> { let Latency = 2; @@ -776,8 +769,7 @@ def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)", - "VPBROADCAST(B|W)rr", - "(V?)PCMPGTQ(Y?)rr")>; + "VPBROADCAST(B|W)rr")>; def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> { let Latency = 3; @@ -839,13 +831,6 @@ def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { } def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>; -def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { - let Latency = 3; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>; - def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> { let Latency = 3; let NumMicroOps = 4; @@ -1183,6 +1168,14 @@ def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06 def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def SKLWriteResGroup100_1 : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup100_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { let Latency = 7; let NumMicroOps = 5; @@ -1747,4 +1740,150 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>; def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Skylake Pipeline" > "Register allocation and renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def SKLWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def SKLWriteZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteALU]> +]>; +def : InstRW<[SKLWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def SKLWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogic]> +]>; +def : InstRW<[SKLWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, + VXORPDrr)>; + +def SKLWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogicY]> +]>; +def : InstRW<[SKLWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr)>; + +def SKLWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicX]> +]>; +def : InstRW<[SKLWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr)>; + +def SKLWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicY]> +]>; +def : InstRW<[SKLWriteVZeroIdiomLogicY], (instrs VPXORYrr)>; + +def SKLWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUX]> +]>; +def : InstRW<[SKLWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def SKLWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUY]> +]>; +def : InstRW<[SKLWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def SKLWritePSUB : SchedWriteRes<[SKLPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKLWriteVZeroIdiomPSUB : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [SKLWritePSUB]> +]>; +def : InstRW<[SKLWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + VPSUBBYrr, + VPSUBDYrr, + VPSUBQYrr, + VPSUBWYrr)>; + +def SKLWritePCMPGTQ : SchedWriteRes<[SKLPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKLWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKLWriteZeroLatency]>, + SchedVar<NoSchedPred, [SKLWritePCMPGTQ]> +]>; +def : InstRW<[SKLWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// CMOVs that use both Z and C flag require an extra uop. 
+def SKLWriteCMOVA_CMOVBErr : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKLWriteCMOVA_CMOVBErm : SchedWriteRes<[SKLPort23,SKLPort06]> { + let Latency = 7; + let ResourceCycles = [1,2]; + let NumMicroOps = 3; +} + +def SKLCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKLWriteCMOVA_CMOVBErr]>, + SchedVar<NoSchedPred, [WriteCMOV]> +]>; + +def SKLCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKLWriteCMOVA_CMOVBErm]>, + SchedVar<NoSchedPred, [WriteCMOV.Folded]> +]>; + +def : InstRW<[SKLCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[SKLCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def SKLWriteSETA_SETBEr : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKLWriteSETA_SETBEm : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { + let Latency = 3; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def SKLSETA_SETBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKLWriteSETA_SETBEr]>, + SchedVar<NoSchedPred, [WriteSETCC]> +]>; + +def SKLSETA_SETBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKLWriteSETA_SETBEm]>, + SchedVar<NoSchedPred, [WriteSETCCStore]> +]>; + +def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td index ec8e4db02d8a..58caf1dacfcb 100755 --- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -1,9 +1,8 @@ //=- X86SchedSkylake.td - X86 Skylake Server Scheduling ------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -81,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 6>; def : ReadAdvance<ReadAfterVecYLd, 7>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -158,7 +159,6 @@ defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>; def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads. defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move. -defm : SKXWriteResPair<WriteCMOV2, [SKXPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move. defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc. def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> { @@ -176,7 +176,7 @@ defm : X86WriteRes<WriteBitTestSetRegLd, [SKXPort0156,SKXPort23], 5, [1,1], 2>; // Integer shifts and rotates. 
defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>; defm : SKXWriteResPair<WriteShiftCL, [SKXPort06], 3, [3], 3>; -defm : SKXWriteResPair<WriteRotate, [SKXPort06], 2, [2], 2>; +defm : SKXWriteResPair<WriteRotate, [SKXPort06], 1, [1], 1>; defm : SKXWriteResPair<WriteRotateCL, [SKXPort06], 3, [3], 3>; // SHLD/SHRD. @@ -680,8 +680,7 @@ def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr", "VPBLENDMD(Z128|Z256)rr", "VPBLENDMQ(Z128|Z256)rr", "VPBLENDMW(Z128|Z256)rr", - "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr", - "(V?)PSUB(B|D|Q|W)rr", + "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rrk", "VPTERNLOGD(Z|Z128|Z256)rri", "VPTERNLOGQ(Z|Z128|Z256)rri")>; @@ -722,13 +721,6 @@ def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> { def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP, MMX_MOVDQ2Qrr)>; -def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKXWriteResGroup15], (instregex "SET(A|BE)r")>; - def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -759,9 +751,10 @@ def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> { } def: InstRW<[SKXWriteResGroup23], (instrs CWD, JCXZ, JECXZ, JRCXZ, - ADC8i8, SBB8i8)>; -def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri", - "SBB8ri")>; + ADC8i8, SBB8i8, + ADC16i16, SBB16i16, + ADC32i32, SBB32i32, + ADC64i32, SBB64i32)>; def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> { let Latency = 2; @@ -834,7 +827,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0 "VPCMPD(Z|Z128|Z256)rri", "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr", "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr", - "(V?)PCMPGTQ(Y?)rr", "VPCMPQ(Z|Z128|Z256)rri", "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri", "VPCMPW(Z|Z128|Z256)rri", @@ -900,13 +892,6 @@ def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> { } def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>; -def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> { - let Latency = 3; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>; - def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> { let Latency = 3; let NumMicroOps = 4; @@ -1446,6 +1431,14 @@ def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06 def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m(1|i)", "ROR(8|16|32|64)m(1|i)")>; +def SKXWriteResGroup107_1 : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup107_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1, + ROR8r1, ROR16r1, ROR32r1, ROR64r1)>; + def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> { let Latency = 7; let NumMicroOps = 5; @@ -2463,4 +2456,171 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>; def: InstRW<[WriteZero], (instrs CLC)>; + +// Intruction variants handled by the renamer. These might not need execution +// ports in certain conditions. +// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// section "Skylake Pipeline" > "Register allocation and renaming". +// These can be investigated with llvm-exegesis, e.g. 
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=- +// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=- + +def SKXWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def SKXWriteZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteALU]> +]>; +def : InstRW<[SKXWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def SKXWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogic]> +]>; +def : InstRW<[SKXWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, + XORPDrr, VXORPDrr, + VXORPSZ128rr, + VXORPDZ128rr)>; + +def SKXWriteFZeroIdiomY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogicY]> +]>; +def : InstRW<[SKXWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr, + VXORPSZ256rr, VXORPDZ256rr)>; + +def SKXWriteFZeroIdiomZ : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteFLogicZ]> +]>; +def : InstRW<[SKXWriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr)>; + +def SKXWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicX]> +]>; +def : InstRW<[SKXWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr, + VPXORDZ128rr, VPXORQZ128rr)>; + +def SKXWriteVZeroIdiomLogicY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicY]> +]>; +def : InstRW<[SKXWriteVZeroIdiomLogicY], (instrs VPXORYrr, + VPXORDZ256rr, VPXORQZ256rr)>; + +def SKXWriteVZeroIdiomLogicZ : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecLogicZ]> +]>; +def : InstRW<[SKXWriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr)>; + +def SKXWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUX]> +]>; +def : InstRW<[SKXWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTWrr, VPCMPGTWrr)>; + +def SKXWriteVZeroIdiomALUY : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [WriteVecALUY]> +]>; +def : InstRW<[SKXWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr, + VPCMPGTDYrr, + VPCMPGTWYrr)>; + +def SKXWritePSUB : SchedWriteRes<[SKXPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKXWriteVZeroIdiomPSUB : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [SKXWritePSUB]> +]>; + +def : InstRW<[SKXWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, VPSUBBZ128rr, + PSUBDrr, VPSUBDrr, VPSUBDZ128rr, + PSUBQrr, VPSUBQrr, VPSUBQZ128rr, + PSUBWrr, VPSUBWrr, VPSUBWZ128rr, + VPSUBBYrr, VPSUBBZ256rr, + VPSUBDYrr, VPSUBDZ256rr, + VPSUBQYrr, VPSUBQZ256rr, + VPSUBWYrr, VPSUBWZ256rr, + VPSUBBZrr, + VPSUBDZrr, + VPSUBQZrr, + VPSUBWZrr)>; +def SKXWritePCMPGTQ : SchedWriteRes<[SKXPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKXWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [SKXWriteZeroLatency]>, + SchedVar<NoSchedPred, [SKXWritePCMPGTQ]> +]>; 
+def : InstRW<[SKXWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr, + VPCMPGTQYrr)>; + + +// CMOVs that use both Z and C flag require an extra uop. +def SKXWriteCMOVA_CMOVBErr : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKXWriteCMOVA_CMOVBErm : SchedWriteRes<[SKXPort23,SKXPort06]> { + let Latency = 7; + let ResourceCycles = [1,2]; + let NumMicroOps = 3; +} + +def SKXCMOVA_CMOVBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [SKXWriteCMOVA_CMOVBErr]>, + SchedVar<NoSchedPred, [WriteCMOV]> +]>; + +def SKXCMOVA_CMOVBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [SKXWriteCMOVA_CMOVBErm]>, + SchedVar<NoSchedPred, [WriteCMOV.Folded]> +]>; + +def : InstRW<[SKXCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>; +def : InstRW<[SKXCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; + +// SETCCs that use both Z and C flag require an extra uop. +def SKXWriteSETA_SETBEr : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def SKXWriteSETA_SETBEm : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> { + let Latency = 3; + let ResourceCycles = [1,1,2]; + let NumMicroOps = 4; +} + +def SKXSETA_SETBErr : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [SKXWriteSETA_SETBEr]>, + SchedVar<NoSchedPred, [WriteSETCC]> +]>; + +def SKXSETA_SETBErm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [SKXWriteSETA_SETBEm]>, + SchedVar<NoSchedPred, [WriteSETCCStore]> +]>; + +def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>; +def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>; + } // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td index 25aa83f96d3a..55ca85ec1e3d 100644 --- a/contrib/llvm/lib/Target/X86/X86Schedule.td +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -1,9 +1,8 @@ //===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -18,6 +17,12 @@ def ReadAfterVecLd : SchedRead; def ReadAfterVecXLd : SchedRead; def ReadAfterVecYLd : SchedRead; +// Instructions that move data between general purpose registers and vector +// registers may be subject to extra latency due to data bypass delays. +// This SchedRead describes a bypass delay caused by data being moved from the +// integer unit to the floating point unit. +def ReadInt2Fpu : SchedRead; + // Instructions with both a load and a store folded are modeled as a folded // load + WriteRMW. def WriteRMW : SchedWrite; @@ -158,7 +163,6 @@ defm WritePOPCNT : X86SchedWritePair; // Bit population count. defm WriteLZCNT : X86SchedWritePair; // Leading zero count. defm WriteTZCNT : X86SchedWritePair; // Trailing zero count. defm WriteCMOV : X86SchedWritePair; // Conditional move. -defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move. def WriteFCMOV : SchedWrite; // X87 conditional move. def WriteSETCC : SchedWrite; // Set register based on condition code. 
def WriteSETCCStore : SchedWrite; diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td index 1589ff2ef402..b0334655de7e 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -1,9 +1,8 @@ //===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -47,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>; def : ReadAdvance<ReadAfterVecXLd, 3>; def : ReadAdvance<ReadAfterVecYLd, 3>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. @@ -112,7 +113,6 @@ defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[1 defm : X86WriteResPairUnsupported<WriteCRC32>; defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>; -defm : AtomWriteResPair<WriteCMOV2, [AtomPort01], [AtomPort0]>; defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [AtomPort01]>; @@ -740,7 +740,7 @@ def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> { let Latency = 45; let ResourceCycles = [45]; } -def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>; +def : InstRW<[AtomWrite01_45], (instrs MONITOR32rrr, MONITOR64rrr)>; def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> { let Latency = 46; diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBdVer2.td index 5798e1b2671b..8cc01c3acece 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -1,9 +1,8 @@ //=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -209,7 +208,10 @@ multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW, !add(Lat, LoadLat), !if(!and(!empty(Res), !eq(LoadRes, 1)), [], - !listconcat([LoadRes], Res)), + !listconcat([LoadRes], + !if(!empty(Res), + !listsplat(1, !size(ExePorts)), + Res))), !add(UOps, LoadUOps)>; } @@ -218,7 +220,7 @@ multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW, list<int> Res = [], int UOps = 1, int LoadUOps = 0> { defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps, - /*LoadLat*/4, /*LoadRes*/1, LoadUOps>; + /*LoadLat*/4, /*LoadRes*/3, LoadUOps>; } multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW, @@ -226,15 +228,15 @@ multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW, list<int> Res = [], int UOps = 1, int LoadUOps = 0> { defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps, - /*LoadLat*/5, /*LoadRes*/1, LoadUOps>; + /*LoadLat*/5, /*LoadRes*/3, LoadUOps>; } multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW, list<ProcResourceKind> ExePorts, int Lat, - list<int> Res, int UOps = 2, + list<int> Res = [], int UOps = 2, int LoadUOps = 0> { defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps, - /*LoadLat*/5, /*LoadRes*/2, LoadUOps>; + /*LoadLat*/5, /*LoadRes*/3, LoadUOps>; } //===----------------------------------------------------------------------===// @@ -251,6 +253,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 5>; +// Transfer from int domain to ivec domain incurs additional latency of 8..10cy +// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller +// and Excavator pipeline", "Data delay between different execution domains" +def : ReadAdvance<ReadInt2Fpu, -10>; + // A folded store needs a cycle on the PdStore for the store data. def : WriteRes<WriteRMW, [PdStore]>; @@ -258,15 +265,15 @@ def : WriteRes<WriteRMW, [PdStore]>; // Loads, stores, and moves, not folded with other operations. //////////////////////////////////////////////////////////////////////////////// -def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; } +def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; } def : WriteRes<WriteStore, [PdStore]>; def : WriteRes<WriteStoreNT, [PdStore]>; -def : WriteRes<WriteMove, [PdEX01]>; +def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; } // Load/store MXCSR. // FIXME: These are copy and pasted from WriteLoad/Store. def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; } -def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; } +def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; } // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; @@ -300,6 +307,7 @@ def : InstRW<[PdWriteXLAT], (instrs XLAT)>; def PdWriteLARrr : SchedWriteRes<[PdEX01]> { let Latency = 184; + let ResourceCycles = [375]; let NumMicroOps = 45; } def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr", @@ -307,22 +315,31 @@ def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr", // Nops don't have dependencies, so there's no actual latency, but we set this // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. 
-def : WriteRes<WriteNop, [PdEX01]>; +def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; } //////////////////////////////////////////////////////////////////////////////// // Arithmetic. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResExPair<WriteALU, [PdEX01]>; +defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>; + +def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> { + let Latency = 6; + let ResourceCycles = [3, 2, 1]; + let NumMicroOps = 1; +} +def : SchedAlias<WriteALURMW, PdWriteALURMW>; def PdWriteLXADD : SchedWriteRes<[PdEX01]> { let Latency = 6; + let ResourceCycles = [88]; let NumMicroOps = 4; } def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>; def PdWriteBMI1 : SchedWriteRes<[PdEX01]> { let Latency = 2; + let ResourceCycles = [2]; let NumMicroOps = 2; } def : InstRW<[PdWriteBMI1], @@ -332,8 +349,9 @@ def : InstRW<[PdWriteBMI1], BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr, TZMSK32rr, TZMSK64rr)>; -def PdWriteBMI1m : SchedWriteRes<[PdEX01]> { +def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> { let Latency = 6; + let ResourceCycles = [3, 3]; let NumMicroOps = 2; } def : InstRW<[PdWriteBMI1m], @@ -345,26 +363,34 @@ def : InstRW<[PdWriteBMI1m], defm : PdWriteResExPair<WriteADC, [PdEX01], 1, [2]>; -defm : PdWriteRes<WriteBSWAP32, [PdEX1]>; -defm : PdWriteRes<WriteBSWAP64, [PdEX1]>; -defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [], 5>; -defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [], 2>; -defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>; +def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> { + let ResourceCycles = [3]; +} +def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>; + +defm : PdWriteRes<WriteBSWAP32, [PdEX01]>; +defm : PdWriteRes<WriteBSWAP64, [PdEX01]>; +defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [3], 5>; +defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [44, 1, 1], 2>; +defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>; def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [3]; let NumMicroOps = 3; } def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>; def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [23]; let NumMicroOps = 5; } def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>; def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [21]; let NumMicroOps = 6; } def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm], @@ -372,42 +398,40 @@ def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm], def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [26]; let NumMicroOps = 18; } def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>; def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> { let Latency = 3; + let ResourceCycles = [69]; let NumMicroOps = 22; } def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>; -def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> { - let Latency = 2; - let NumMicroOps = 2; -} -def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>; - def PdWriteXADD : SchedWriteRes<[PdEX1]> { - let Latency = 2; - let NumMicroOps = 4; + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; } def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>; def PdWriteXADDm : SchedWriteRes<[PdEX1]> { -let Latency = 6; -let NumMicroOps = 4; + let Latency = 6; + let ResourceCycles = [20]; + let NumMicroOps = 4; } def : 
InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>; -defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4>; -defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [], 2>; -defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [], 2>; -defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4>; -defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4>; -defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [], 1, 1>; -defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4>; -defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 4]>; +defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4, [1, 4]>; +defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [1, 5], 2>; +defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [1, 5], 2>; +defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4, [1, 2]>; +defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4, [1, 4]>; +defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [1, 2], 1, 1>; +defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>; +defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>; defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4],1, 1>; defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>; defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX @@ -422,36 +446,48 @@ defm : PdWriteResExPair<WriteIDiv16, [PdEX1, PdDiv], 15, [1, 17], defm : PdWriteResExPair<WriteIDiv32, [PdEX1, PdDiv], 14, [1, 25], 2>; defm : PdWriteResExPair<WriteIDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>; -defm : PdWriteResExPair<WriteCRC32, [PdEX01], 3, [4], 3>; +defm : PdWriteResExPair<WriteCRC32, [PdEX01], 2, [4], 3>; def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> { let Latency = 5; - let ResourceCycles = [4]; + let ResourceCycles = [10]; let NumMicroOps = 5; } def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>; def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> { let Latency = 6; - let ResourceCycles = [4]; + let ResourceCycles = [12]; let NumMicroOps = 7; } def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>; def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> { let Latency = 10; - let ResourceCycles = [4]; + let ResourceCycles = [17]; let NumMicroOps = 11; } def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>; defm : PdWriteResExPair<WriteCMOV, [PdEX01]>; // Conditional move. -defm : PdWriteResExPair<WriteCMOV2, [PdEX01], 1, [], 1, 1>; // Conditional (CF + ZF flag) move. -def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm, - CMOVGE16rm, CMOVGE32rm, CMOVGE64rm, - CMOVL16rm, CMOVL32rm, CMOVL64rm, - CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>; +def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> { + let Latency = 5; + let ResourceCycles = [3, 3]; + let NumMicroOps = 2; +} + +def PdWriteCMOVmVar : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>, [PdWriteCMOVm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>, [PdWriteCMOVm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>, [PdWriteCMOVm]>, + SchedVar<NoSchedPred, [WriteCMOV.Folded]> +]>; + +def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; defm : PdWriteRes<WriteFCMOV, [PdFPU0, PdFPFMA]>; // x87 conditional move. 
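As a reading aid for the condition-code predicates used by PdWriteCMOVmVar above and by the IsCMOVA*/IsSETA* predicates in X86SchedPredicates.td earlier in this diff: the indices (3 and 7 for CMOV, 1 and 5 for SETCC) are the position of the condition-code immediate in the MCInst. The register forms are laid out as (dst, src1, src2, cond) for CMOV16/32/64rr and (dst, cond) for SETCCr; the memory forms carry the five address operands (base, scale, index, displacement, segment) in between, which pushes the condition code to operand 7 for CMOV16/32/64rm and operand 5 for SETCCm. A minimal sketch of the same mechanism for one condition, with an invented name and not part of the patch:

// Hypothetical predicate: matches memory-form CMOVs whose condition-code
// operand (index 7) is COND_BE, i.e. cmovbe with a memory source.
def IsCMOVBErm_Example : CheckImmOperand_s<7, "X86::COND_BE">;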
@@ -462,107 +498,143 @@ def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> { let ResourceCycles = [2]; let NumMicroOps = 2; } -def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm, - SETLEm, SETLm)>; -defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [], 2>; +def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>, [PdWriteSETGEmSETGmSETLEmSETLm]>, + SchedVar<NoSchedPred, [WriteSETCCStore]> +]>; +def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>; + +defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [4], 2>; -def WriteLAHF : SchedWriteRes<[PdEX01]> { +def PdWriteLAHF : SchedWriteRes<[PdEX01]> { let Latency = 2; + let ResourceCycles = [4]; let NumMicroOps = 4; } -def : InstRW<[WriteLAHF], (instrs LAHF)>; +def : InstRW<[PdWriteLAHF], (instrs LAHF)>; -def WriteSAHF : SchedWriteRes<[PdEX01]> { +def PdWriteSAHF : SchedWriteRes<[PdEX01]> { let Latency = 2; + let ResourceCycles = [2]; let NumMicroOps = 2; } -def : InstRW<[WriteSAHF], (instrs SAHF)>; +def : InstRW<[PdWriteSAHF], (instrs SAHF)>; + +defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [2], 1>; +defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [2, 3], 1>; +defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [7, 2], 7>; +defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [2], 2>; +defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>; +defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>; -defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [1], 1>; -defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [1, 1], 1>; -defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [1, 1], 7>; -defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [1], 2>; -defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>; -defm : PdWriteRes<WriteBitTestSetImmRMW, [PdEX01, PdLoad], 6, [1, 1], 4>; -defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>; -defm : PdWriteRes<WriteBitTestSetRegRMW, [PdEX01, PdLoad], 6, [1, 1], 10>; +def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> { + let Latency = 7; + let ResourceCycles = [42, 1]; + let NumMicroOps = 4; +} +def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>; +def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> { + let Latency = 7; + let ResourceCycles = [44, 1]; + let NumMicroOps = 10; +} +def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>; // This is for simple LEAs with one or two input operands. // FIXME: SAGU 3-operand LEA def : WriteRes<WriteLEA, [PdEX01]> { let NumMicroOps = 2; } // Bit counts. 
-defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [4], 6, 2>; -defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [4], 7, 2>; -defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4>; -defm : PdWriteResExPair<WriteLZCNT, [PdEX01], 2, [], 2>; -defm : PdWriteResExPair<WriteTZCNT, [PdEX01], 2, [2], 2>; +defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>; +defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [8], 7, 2>; +defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4, [4]>; +defm : PdWriteResExPair<WriteLZCNT, [PdEX0], 2, [2], 2>; +defm : PdWriteResExPair<WriteTZCNT, [PdEX0], 2, [2], 2>; // BMI1 BEXTR, BMI2 BZHI -defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [], 2>; -defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [], 2>; +defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [2], 2>; +defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [2], 2>; defm : PdWriteResExPair<WriteBZHI, [PdEX01]>; +def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let ResourceCycles = [4]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>; + +def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let ResourceCycles = [5]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>; + //////////////////////////////////////////////////////////////////////////////// // Integer shifts and rotates. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResExPair<WriteShift, [PdEX01]>; +defm : PdWriteResExPair<WriteShift, [PdEX01], 1, [2]>; defm : PdWriteResExPair<WriteShiftCL, [PdEX01]>; -defm : PdWriteResExPair<WriteRotate, [PdEX01]>; +defm : PdWriteResExPair<WriteRotate, [PdEX01], 1, [2]>; defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>; def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> { let Latency = 12; + let ResourceCycles = [24]; let NumMicroOps = 26; } def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>; def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> { let Latency = 12; + let ResourceCycles = [23]; let NumMicroOps = 23; } def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>; def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> { let Latency = 11; + let ResourceCycles = [22]; let NumMicroOps = 24; } def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>; def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> { let Latency = 10; + let ResourceCycles = [20]; let NumMicroOps = 22; } def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>; def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> { let Latency = 10; + let ResourceCycles = [19]; let NumMicroOps = 19; } def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>; -def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> { +def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> { let Latency = 7; + let ResourceCycles = [14]; let NumMicroOps = 17; } -def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>; +def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>; -def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> { +def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> { let Latency = 7; + let ResourceCycles = [13]; let NumMicroOps = 16; } -def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>; - -def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> { - let Latency = 7; - let NumMicroOps = 16; -} -def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>; +def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>; def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> { let Latency = 7; + let ResourceCycles = [14]; let NumMicroOps = 15; } def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>; @@ -570,31 
+642,35 @@ def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>; def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> { let Latency = 9; + let ResourceCycles = [18]; let NumMicroOps = 20; } def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>; def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> { let Latency = 11; + let ResourceCycles = [21]; let NumMicroOps = 21; } def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>; def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> { let Latency = 8; + let ResourceCycles = [15]; let NumMicroOps = 16; } def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>; def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> { let Latency = 13; + let ResourceCycles = [25]; let NumMicroOps = 25; } def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>; // SHLD/SHRD. -defm : PdWriteRes<WriteSHDrri, [PdEX01], 4, [6], 6>; -defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 4, [8], 7>; +defm : PdWriteRes<WriteSHDrri, [PdEX01], 3, [6], 6>; +defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 3, [8], 7>; def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> { let Latency = 3; @@ -604,8 +680,8 @@ def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> { def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>; def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> { - let Latency = 4; - let ResourceCycles = [8]; + let Latency = 3; + let ResourceCycles = [6]; let NumMicroOps = 7; } def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL, @@ -623,19 +699,20 @@ defm : PdWriteRes<WriteFLD0, [PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFLD1, [PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFLDC, [PdFPU1, PdFPSTO], 3>; -defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5>; -defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5>; -defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [], 2>; +defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>; +defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>; +defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>; -defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [1, 1, 2]>; -defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [2, 2, 4], 2>; +defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>; +defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>; -defm : PdWriteRes<WriteFStore, [PdStore, PdFPU1, PdFPSTO], 2>; -defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU1, PdFPSTO]>; -defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU1, PdFPSTO], 1, [], 4>; +defm : PdWriteRes<WriteFStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>; +defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>; +defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>; -def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> { +def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> { let Latency = 2; + let ResourceCycles = [1, 3, 1]; let NumMicroOps = 2; } def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>; @@ -649,33 +726,41 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>; -defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 4], 18>; -defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 4], 34>; +defm : 
PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; +defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>; -defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA]>; +defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>; defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>; defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>; defm : PdWriteResXMMPair<WriteFAdd, [PdFPU0, PdFPFMA], 5>; defm : PdWriteResXMMPair<WriteFAddX, [PdFPU0, PdFPFMA], 5>; -defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [2, 1]>; +defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFAddZ>; +def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> { + let Latency = 5; + let ResourceCycles = [3, 1, 10]; +} +def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m, + SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m, + SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>; + defm : PdWriteResXMMPair<WriteFAdd64, [PdFPU0, PdFPFMA], 5>; defm : PdWriteResXMMPair<WriteFAdd64X, [PdFPU0, PdFPFMA], 5>; -defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [2, 1]>; +defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFAdd64Z>; defm : PdWriteResXMMPair<WriteFCmp, [PdFPU0, PdFPFMA], 2>; defm : PdWriteResXMMPair<WriteFCmpX, [PdFPU0, PdFPFMA], 2>; -defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [2, 1]>; +defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFCmpZ>; defm : PdWriteResXMMPair<WriteFCmp64, [PdFPU0, PdFPFMA], 2>; defm : PdWriteResXMMPair<WriteFCmp64X, [PdFPU0, PdFPFMA], 2>; -defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [2, 1]>; +defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>; @@ -690,29 +775,35 @@ def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>; defm : PdWriteResXMMPair<WriteFMul, [PdFPU1, PdFPFMA], 5>; defm : PdWriteResXMMPair<WriteFMulX, [PdFPU1, PdFPFMA], 5>; -defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [2, 1]>; +defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFMulZ>; +def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> { + let Latency = 5; + let ResourceCycles = [3, 1, 10]; +} +def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>; + defm : PdWriteResXMMPair<WriteFMul64, [PdFPU1, PdFPFMA], 5>; defm : PdWriteResXMMPair<WriteFMul64X, [PdFPU1, PdFPFMA], 5>; -defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [2, 1]>; +defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFMul64Z>; -defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5>; -defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5>; -defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 1]>; +defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5, [1, 3]>; +defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5, [1, 3]>; +defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 3]>; defm : X86WriteResPairUnsupported<WriteFMAZ>; -defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 3], 15, 2>; +defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 10], 15, 
2>; -defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 3], 16, 2>; -defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 6], /*or 29*/ 25, 4>; +defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 14], 16, 2>; +defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>; defm : X86WriteResPairUnsupported<WriteDPPSZ>; def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> { - let Latency = 25; - let ResourceCycles = [1, 3]; + let Latency = 27; + let ResourceCycles = [1, 14]; let NumMicroOps = 17; } def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>; @@ -722,118 +813,140 @@ defm : PdWriteResXMMPair<WriteFRcpX, [PdFPU1, PdFPFMA], 5>; defm : PdWriteResYMMPair<WriteFRcpY, [PdFPU1, PdFPFMA], 5, [2, 1]>; defm : X86WriteResPairUnsupported<WriteFRcpZ>; -defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5>; +defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5, [1, 2]>; defm : PdWriteResXMMPair<WriteFRsqrtX, [PdFPU1, PdFPFMA], 5>; -defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 1]>; +defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 2]>; defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; -defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 19]>; -defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 19]>; -defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 38]>; +defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 18]>; defm : X86WriteResPairUnsupported<WriteFDivZ>; -defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 19]>; -defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 19]>; -defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 38]>; +def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> { + let Latency = 9; + let ResourceCycles = [3, 1, 18]; +} +def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m, + DIVR_FI16m, DIVR_FI32m, + DIV_F32m, DIV_F64m, + DIVR_F32m, DIVR_F64m)>; + +defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>; defm : X86WriteResPairUnsupported<WriteFDiv64Z>; -defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 21]>; -defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 21]>; -defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 42]>; +defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 18]>; defm : X86WriteResPairUnsupported<WriteFSqrtZ>; -defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 27]>; -defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 27]>; -defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 54]>; +defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 9]>; +defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>; defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; -defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 35]>; -defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA]>; +defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 18]>; 
+defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA], 1, [1, 4]>; -defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4>; +defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4, []>; defm : PdWriteResYMMPair<WriteFRndY, [PdFPU1, PdFPSTO], 4, [2, 1], 2>; defm : X86WriteResPairUnsupported<WriteFRndZ>; -def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> { + let Latency = 10; + let ResourceCycles = [2, 1]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>; + +def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 10; + let ResourceCycles = [10, 1]; let NumMicroOps = 2; } -def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr, - VFRCZSDrr, VFRCZSSrr)>; +def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>; def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 15; - let NumMicroOps = 2; + let ResourceCycles = [2, 1]; + let NumMicroOps = 3; } def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm, VFRCZSDrm, VFRCZSSrm)>; def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 10; - let ResourceCycles = [2, 1]; + let ResourceCycles = [3, 1]; let NumMicroOps = 4; } def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>; def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> { let Latency = 15; - let ResourceCycles = [2, 1]; + let ResourceCycles = [4, 1]; let NumMicroOps = 8; } def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>; -defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2>; +defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2, [1, 2]>; defm : PdWriteResYMMPair<WriteFLogicY, [PdFPU01, PdFPFMA], 2, [2, 2]>; defm : X86WriteResPairUnsupported<WriteFLogicZ>; defm : PdWriteResXMMPair<WriteFTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>; -defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>; +defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>; defm : X86WriteResPairUnsupported<WriteFTestZ>; -defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2>; -defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>; +defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2, [1, 2]>; +defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>; defm : X86WriteResPairUnsupported<WriteFShuffleZ>; def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 7; + let ResourceCycles = [1, 3]; let NumMicroOps = 2; } def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>; -defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 4]>; -defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 6], 2>; +defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 2]>; +defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 4], 2>; defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; -defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2>; -defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>; +defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>; +defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 3], 2>; defm : X86WriteResPairUnsupported<WriteFBlendZ>; -defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 4]>; -defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 6], 2>; +defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>; +defm : 
PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>; defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; -defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [], 2>; +defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [1, 3], 2>; defm : X86WriteResPairUnsupported<WriteFVarShuffle256>; def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 2; + let ResourceCycles = [1, 2]; } def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>; def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 7; + let ResourceCycles = [1, 4]; let NumMicroOps = 2; } def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>; def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 4; + let ResourceCycles = [1, 6]; let NumMicroOps = 8; } def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>; def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> { let Latency = 8; // 4 + 4 + let ResourceCycles = [1, 8]; let NumMicroOps = 10; } def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>; @@ -842,99 +955,100 @@ def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>; // Conversions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>; +defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>; -defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU1, PdFPSTO], 4>; -defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU1, PdFPSTO], 4, [2, 1]>; +defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU0, PdFPCVT, PdFPSTO], 4>; +defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>; defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; -defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>; +defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>; -defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU1, PdFPSTO], 8, [], 2>; -defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>; +defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>; +defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; -def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>; // FIXME: f+3 ST, LD+STC latency -defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU1, PdFPSTO], 4, [], 2>; +defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>; // FIXME: .Folded version is one NumMicroOp *less*.. -defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU1, PdFPSTO], 4>; -defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU1, PdFPSTO], 4, [2, 1]>; +defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU0, PdFPCVT, PdFPSTO], 4>; +defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>; defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; -defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU1, PdFPSTO], 4, [], 2>; +defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>; // FIXME: .Folded version is one NumMicroOp *less*.. 
-def WriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 13; + let ResourceCycles = [1, 3, 1]; let NumMicroOps = 2; } -def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>; +def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>; -defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU1, PdFPSTO], 8, [], 2>; -defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>; +defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>; +defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>; defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; -defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU1, PdFPSTO], 4>; +defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>; -defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU1, PdFPSTO], 8, [], 2>; -defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>; +defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>; +defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>; defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; -defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU1, PdFPSTO], 4>; +defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>; -defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU1, PdFPSTO], 8, [], 2>; -defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>; +defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>; +defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; -def WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } -def : InstRW<[WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr, +def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr, MMX_CVTPI2PDirr)>; -def WriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> { +def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 4; let NumMicroOps = 2; } -def : InstRW<[WriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>; +def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>; -defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU1, PdFPSTO], 8, [], 2, 1>; -defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 3>; +defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>; +defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>; defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; -defm : PdWriteRes<WriteCvtPS2PH, [PdFPU1, PdFPSTO], 8, [], 2>; -defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>; +defm : PdWriteRes<WriteCvtPS2PH, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2>; +defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; -defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU1, PdFPSTO, PdStore], 4, [], 3>; -defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU1, PdFPSTO, PdFPFMA, PdStore], 4, [2, 1, 1, 1], 4>; +defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU0, PdFPCVT, PdFPSTO, 
PdStore], 4, [1, 2, 1, 1], 3>; +defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>; defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; //////////////////////////////////////////////////////////////////////////////// // Vector integer operations. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5>; -defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5>; -defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [], 2>; +defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>; +defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>; +defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>; -defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5>; -defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5>; +defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>; +defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>; -defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [1, 1, 2]>; -defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>; +defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>; +defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>; -defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU1, PdFPSTO], 2>; -defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU1, PdFPSTO]>; -defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU1, PdFPSTO], 1, [], 4>; +defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>; +defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>; +defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>; def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> { let NumMicroOps = 8; @@ -948,24 +1062,33 @@ defm : PdWriteRes<WriteVecMaskedStore, [PdStore, PdFPU01, PdFPMAL], 6, [1, defm : PdWriteRes<WriteVecMaskedStoreY, [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>; defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>; -defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 2>; +defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>; defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>; -defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 10>; -defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 10, [], 2>; +def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> { +} +def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>; + +def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 4; +} +def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>; + +defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 11>; +defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 11, [1, 2], 2>; defm : PdWriteResXMMPair<WriteVecALU, [PdFPU01, PdFPMAL], 2>; -defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2>; +defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteVecALUY>; defm : X86WriteResPairUnsupported<WriteVecALUZ>; -defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3>; -defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3>; +defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>; +defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3, [1, 2]>; defm : 
X86WriteResPairUnsupported<WriteVecShiftY>; defm : X86WriteResPairUnsupported<WriteVecShiftZ>; -defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2>; -defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2>; +defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2, [1, 2]>; +defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteVecShiftImmY>; defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; @@ -978,55 +1101,67 @@ defm : PdWriteResXMMPair<WritePMULLD, [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL] defm : X86WriteResPairUnsupported<WritePMULLDY>; defm : X86WriteResPairUnsupported<WritePMULLDZ>; -def JWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> { +def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> { let Latency = 4; - let ResourceCycles = [2, 1, 2, 1]; } -def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr, - VPMACSSDQLrr)>; +def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr, + VPMACSSDQLrr)>; -defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 2], 9>; +defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 4], 8>; defm : X86WriteResPairUnsupported<WriteMPSADY>; defm : X86WriteResPairUnsupported<WriteMPSADZ>; -defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [], 2>; -defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [], 2>; +def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> { + let Latency = 8; + let ResourceCycles = [1, 4]; + let NumMicroOps = 10; +} +def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>; + +defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [1, 2], 2>; +defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [1, 2], 2>; defm : X86WriteResPairUnsupported<WritePSADBWY>; defm : X86WriteResPairUnsupported<WritePSADBWZ>; defm : PdWriteResXMMPair<WritePHMINPOS, [PdFPU0, PdFPMAL], 4, [], 2>; -defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2>; -defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2>; -defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 1]>; +defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2, [1, 2]>; +defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2, [1, 2]>; +defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 4]>; defm : X86WriteResPairUnsupported<WriteShuffleZ>; -defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 4]>; -defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 4]>; +defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 2]>; +defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 3]>; defm : X86WriteResPairUnsupported<WriteVarShuffleY>; defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 2; + let ResourceCycles = [1, 3]; +} +def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>; + defm : PdWriteResXMMPair<WriteBlend, [PdFPU01, PdFPMAL], 2>; defm : X86WriteResPairUnsupported<WriteBlendY>; defm : X86WriteResPairUnsupported<WriteBlendZ>; -defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 4]>; +defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteVarBlendY>; defm : X86WriteResPairUnsupported<WriteVarBlendZ>; defm : PdWriteResXMMPair<WriteVecLogic, [PdFPU01, PdFPMAL], 2>; -defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 
2>; +defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteVecLogicY>; defm : X86WriteResPairUnsupported<WriteVecLogicZ>; defm : PdWriteResXMMPair<WriteVecTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>; -defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>; +defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>; defm : X86WriteResPairUnsupported<WriteVecTestZ>; defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>; defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>; -defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3>; +defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>; defm : X86WriteResPairUnsupported<WriteVarVecShiftY>; defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; @@ -1034,14 +1169,15 @@ defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Vector insert/extract operations. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [], 2>; -defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [], 2>; +defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [1, 3], 2>; +defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>; -defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>; -defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [], 2>; +defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>; +defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>; def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> { let Latency = 3; + let ResourceCycles = [1, 3]; } def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>; @@ -1049,19 +1185,19 @@ def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>; // SSE42 String instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 14, [1, 2, 1], 7, 1>; -defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 6, [1, 2, 1], 7, 2>; +defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>; +defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 7, [1, 8, 1], 7, 2>; -defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 15, [1, 2, 6, 4, 1, 1], 27, 1>; -defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 2, 6, 4, 1, 1], 27, 1>; +defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>; +defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>; //////////////////////////////////////////////////////////////////////////////// // MOVMSK Instructions. 
//////////////////////////////////////////////////////////////////////////////// -defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>; +defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>; -defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>; +defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>; defm : X86WriteResUnsupported<WriteVecMOVMSKY>; // defm : X86WriteResUnsupported<WriteVecMOVMSKZ>; @@ -1079,12 +1215,12 @@ defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>; // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [], 3, 1>; -defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [2, 1], 8, 2>; +defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [1, 5], 3, 1>; +defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>; defm : X86WriteResPairUnsupported<WriteFHAddZ>; -defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [], 3, 1>; -defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2>; +defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>; +defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WritePHAddY>; defm : X86WriteResPairUnsupported<WritePHAddZ>; @@ -1106,10 +1242,11 @@ def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm, // Carry-less multiplication instructions. //////////////////////////////////////////////////////////////////////////////// -defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [], 5, 1>; +defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>; def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> { - let Latency = 13; + let Latency = 12; + let ResourceCycles = [1, 7]; let NumMicroOps = 6; } def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>; @@ -1120,9 +1257,15 @@ def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>; def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> { let Latency = 3; - let ResourceCycles = [1, 4]; + let ResourceCycles = [1, 2]; +} +def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>; + +def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 3; + let ResourceCycles = [1, 3]; } -def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>; +def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>; //////////////////////////////////////////////////////////////////////////////// // AVX instructions. diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 33a6b01546d7..2d26232b4132 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -1,9 +1,8 @@ //=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -109,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>; def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 5>; +/// "Additional 6 cycle transfer operation which moves a floating point +/// operation input value from the integer unit to the floating point unit. +/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2). +def : ReadAdvance<ReadInt2Fpu, -6>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. @@ -174,6 +178,8 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW, } } +// Instructions that have local forwarding disabled have an extra +1cy latency. + // A folded store needs a cycle on the SAGU for the store data, // most RMW instructions don't need an extra uop. defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>; @@ -215,7 +221,6 @@ defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>; defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>; defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move. -defm : JWriteResIntPair<WriteCMOV2, [JALU01], 1>; // Conditional (CF + ZF flag) move. defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [JALU01]>; // Setcc. def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>; @@ -262,14 +267,13 @@ defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>; // Loads, stores, and moves, not folded with other operations. //////////////////////////////////////////////////////////////////////////////// -def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; } def : WriteRes<WriteStore, [JSAGU]>; def : WriteRes<WriteStoreNT, [JSAGU]>; def : WriteRes<WriteMove, [JALU01]>; // Load/store MXCSR. -// FIXME: These are copy and pasted from WriteLoad/Store. -def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; } def : WriteRes<WriteSTMXCSR, [JSAGU]>; // Treat misc copies as a move. @@ -400,8 +404,8 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>; defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>; defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>; defm : X86WriteResPairUnsupported<WriteFShuffleZ>; -defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 2, [1, 4], 3>; -defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 3, [2, 6], 6>; +defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency. +defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency. 
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>; defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>; @@ -425,12 +429,13 @@ defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>; defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>; defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; -// FIXME: f+3 ST, LD+STC latency -defm : JWriteResFpuPair<WriteCvtI2SS, [JFPU1, JSTC], 9, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>; defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>; defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>; defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; -defm : JWriteResFpuPair<WriteCvtI2SD, [JFPU1, JSTC], 9, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>; defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>; defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>; defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; @@ -487,11 +492,11 @@ defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>; defm : X86WriteResPairUnsupported<WriteVecALUY>; defm : X86WriteResPairUnsupported<WriteVecALUZ>; defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>; -defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 2>; // +1cy latency. defm : X86WriteResPairUnsupported<WriteVecShiftY>; defm : X86WriteResPairUnsupported<WriteVecShiftZ>; defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>; -defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency. defm : X86WriteResPairUnsupported<WriteVecShiftImmY>; defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; defm : X86WriteResPairUnsupported<WriteVarVecShift>; @@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>; // Vector insert/extract operations. //////////////////////////////////////////////////////////////////////////////// -defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>; defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>; defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>; defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>; @@ -575,10 +580,10 @@ defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1, // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>; -defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>; -defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>; -defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 4>; // +1cy latency. +defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>; // +1cy latency. +defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>; // +1cy latency. 
defm : X86WriteResPairUnsupported<WritePHAddY>; //////////////////////////////////////////////////////////////////////////////// diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td index fcaff7cf810f..34c251a5c5bb 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -1,9 +1,8 @@ //=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -53,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>; def : ReadAdvance<ReadAfterVecXLd, 3>; def : ReadAdvance<ReadAfterVecYLd, 3>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. @@ -130,7 +131,6 @@ defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>; defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>; defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>; -defm : SLMWriteResPair<WriteCMOV2, [SLM_IEC_RSV01], 2, [2]>; defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move. def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>; def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> { diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td index a866f843106b..65f6d89df610 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -1,9 +1,8 @@ //=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -95,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>; def : ReadAdvance<ReadAfterVecXLd, 8>; def : ReadAdvance<ReadAfterVecYLd, 8>; +def : ReadAdvance<ReadInt2Fpu, 0>; + // The Integer PRF for Zen is 168 entries, and it holds the architectural and // speculative version of the 64-bit integer registers. 
// Reference: "Software Optimization Guide for AMD Family 17h Processors" @@ -214,7 +215,6 @@ defm : ZnWriteResPair<WriteJump, [ZnALU], 1>; defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>; defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>; -defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>; def : WriteRes<WriteSETCC, [ZnALU]>; def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>; defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index 008a9ec2ba3c..50690953eef5 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -1,9 +1,8 @@ //===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -44,24 +43,6 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -namespace { - -// Represents a cover of a buffer of Size bytes with Count() blocks of type AVT -// (of size UBytes() bytes), as well as how many bytes remain (BytesLeft() is -// always smaller than the block size). -struct RepMovsRepeats { - RepMovsRepeats(uint64_t Size) : Size(Size) {} - - uint64_t Count() const { return Size / UBytes(); } - uint64_t BytesLeft() const { return Size % UBytes(); } - uint64_t UBytes() const { return AVT.getSizeInBits() / 8; } - - const uint64_t Size; - MVT AVT = MVT::i8; -}; - -} // namespace - SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val, SDValue Size, unsigned Align, bool isVolatile, @@ -201,98 +182,137 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( return Chain; } -SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( - SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - const X86Subtarget &Subtarget = - DAG.getMachineFunction().getSubtarget<X86Subtarget>(); - if (!ConstantSize) - return SDValue(); - RepMovsRepeats Repeats(ConstantSize->getZExtValue()); - if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold()) +/// Emit a single REP MOVS{B,W,D,Q} instruction. +static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, SDValue Chain, SDValue Dst, + SDValue Src, SDValue Size, MVT AVT) { + const bool Use64BitRegs = Subtarget.isTarget64BitLP64(); + const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX; + const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI; + const unsigned SI = Use64BitRegs ? 
X86::RSI : X86::ESI; + + SDValue InFlag; + Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag}; + return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); +} + +/// Emit a single REP MOVSB instruction for a particular constant size. +static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl, SDValue Chain, SDValue Dst, + SDValue Src, uint64_t Size) { + return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, + DAG.getIntPtrConstant(Size, dl), MVT::i8); +} + +/// Returns the best type to use with repmovs depending on alignment. +static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget, + uint64_t Align) { + assert((Align != 0) && "Align is normalized"); + assert(isPowerOf2_64(Align) && "Align is a power of 2"); + switch (Align) { + case 1: + return MVT::i8; + case 2: + return MVT::i16; + case 4: + return MVT::i32; + default: + return Subtarget.is64Bit() ? MVT::i64 : MVT::i32; + } +} + +/// Returns a REP MOVS instruction, possibly with a few load/stores to implement +/// a constant size memory copy. In some cases where we know REP MOVS is +/// inefficient we return an empty SDValue so the calling code can either +/// generate a load/store sequence or call the runtime memcpy function. +static SDValue emitConstantSizeRepmov( + SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT, + unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { + + /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very + /// efficient. + if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); - /// If not DWORD aligned, it is more efficient to call the library. However - /// if calling the library is not allowed (AlwaysInline), then soldier on as - /// the code generated here is better than the long load-store sequence we - /// would otherwise get. + /// If we have enhanced repmovs we use it. + if (Subtarget.hasERMSB()) + return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size); + + assert(!Subtarget.hasERMSB() && "No efficient RepMovs"); + /// We assume runtime memcpy will do a better job for unaligned copies when + /// ERMS is not present. if (!AlwaysInline && (Align & 3) != 0) return SDValue(); + const MVT BlockType = getOptimalRepmovsType(Subtarget, Align); + const uint64_t BlockBytes = BlockType.getSizeInBits() / 8; + const uint64_t BlockCount = Size / BlockBytes; + const uint64_t BytesLeft = Size % BlockBytes; + SDValue RepMovs = + emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, + DAG.getIntPtrConstant(BlockCount, dl), BlockType); + + /// RepMov can process the whole length. + if (BytesLeft == 0) + return RepMovs; + + assert(BytesLeft && "We have leftover at this point"); + + /// In case we optimize for size we use repmovsb even if it's less efficient + /// so we can save the loads/stores of the leftover. + if (DAG.getMachineFunction().getFunction().hasMinSize()) + return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size); + + // Handle the last 1 - 7 bytes. 
+ SmallVector<SDValue, 4> Results; + Results.push_back(RepMovs); + unsigned Offset = Size - BytesLeft; + EVT DstVT = Dst.getValueType(); + EVT SrcVT = Src.getValueType(); + Results.push_back(DAG.getMemcpy( + Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)), + DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), + DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile, + /*AlwaysInline*/ true, /*isTailCall*/ false, + DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); +} + +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // If to a segment-relative address space, use the default lowering. - if (DstPtrInfo.getAddrSpace() >= 256 || - SrcPtrInfo.getAddrSpace() >= 256) + if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256) return SDValue(); - // If the base register might conflict with our physical registers, bail out. + // If the base registers conflict with our physical registers, use the default + // lowering. const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, X86::ECX, X86::ESI, X86::EDI}; if (isBaseRegConflictPossible(DAG, ClobberSet)) return SDValue(); - // If the target has enhanced REPMOVSB, then it's at least as fast to use - // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle - // BytesLeft. - if (!Subtarget.hasERMSB() && !(Align & 1)) { - if (Align & 2) - // WORD aligned - Repeats.AVT = MVT::i16; - else if (Align & 4) - // DWORD aligned - Repeats.AVT = MVT::i32; - else - // QWORD aligned - Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; - - if (Repeats.BytesLeft() > 0 && - DAG.getMachineFunction().getFunction().optForMinSize()) { - // When aggressively optimizing for size, avoid generating the code to - // handle BytesLeft. - Repeats.AVT = MVT::i8; - } - } - - bool Use64BitRegs = Subtarget.isTarget64BitLP64(); - SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, - DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, - Dst, InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RSI : X86::ESI, - Src, InFlag); - InFlag = Chain.getValue(1); - - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag }; - SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); - SmallVector<SDValue, 4> Results; - Results.push_back(RepMovs); - if (Repeats.BytesLeft()) { - // Handle the last 1 - 7 bytes. 
- unsigned Offset = Repeats.Size - Repeats.BytesLeft(); - EVT DstVT = Dst.getValueType(); - EVT SrcVT = Src.getValueType(); - EVT SizeVT = Size.getValueType(); - Results.push_back(DAG.getMemcpy(Chain, dl, - DAG.getNode(ISD::ADD, dl, DstVT, Dst, - DAG.getConstant(Offset, dl, - DstVT)), - DAG.getNode(ISD::ADD, dl, SrcVT, Src, - DAG.getConstant(Offset, dl, - SrcVT)), - DAG.getConstant(Repeats.BytesLeft(), dl, - SizeVT), - Align, isVolatile, AlwaysInline, false, - DstPtrInfo.getWithOffset(Offset), - SrcPtrInfo.getWithOffset(Offset))); - } + /// Handle constant sizes, + if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size)) + return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src, + ConstantSize->getZExtValue(), + Size.getValueType(), Align, isVolatile, + AlwaysInline, DstPtrInfo, SrcPtrInfo); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); + return SDValue(); } diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h index f4a285a5f916..0f2d979f91e3 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -1,9 +1,8 @@ //===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 720be8afa62c..a202fc63637b 100644 --- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -1,9 +1,8 @@ //===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h index b08c31935d28..296341517579 100644 --- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -1,9 +1,8 @@ //===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index a729161a1beb..40f5dbe57e4b 100644 --- a/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -1,9 +1,8 @@ //====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -123,10 +122,7 @@ namespace { class X86SpeculativeLoadHardeningPass : public MachineFunctionPass { public: - X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { - initializeX86SpeculativeLoadHardeningPassPass( - *PassRegistry::getPassRegistry()); - } + X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { } StringRef getPassName() const override { return "X86 speculative load hardening"; @@ -661,7 +657,7 @@ X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) { // jmpq *%rax // ``` // We still want to harden the edge to `L1`. - if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) { + if (X86::getCondFromBranch(MI) == X86::COND_INVALID) { Info.CondBrs.clear(); Info.UncondBr = &MI; continue; @@ -752,7 +748,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( for (X86::CondCode Cond : Conds) { int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; - auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes); + auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); // Note that we intentionally use an empty debug location so that @@ -760,7 +756,8 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(CurStateReg) - .addReg(PS->PoisonReg); + .addReg(PS->PoisonReg) + .addImm(Cond); // If this is the last cmov and the EFLAGS weren't originally // live-in, mark them as killed. if (!LiveEFLAGS && Cond == Conds.back()) @@ -789,7 +786,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB(); int &SuccCount = SuccCounts[&Succ]; - X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode()); + X86::CondCode Cond = X86::getCondFromBranch(*CondBr); X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond); UncondCodeSeq.push_back(Cond); @@ -1177,12 +1174,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // Now cmov over the predicate if the comparison wasn't equal. 
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; - auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes); + auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(PS->InitialReg) - .addReg(PS->PoisonReg); + .addReg(PS->PoisonReg) + .addImm(X86::COND_NE); CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); ++NumInstsInserted; LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n"); @@ -1963,6 +1961,14 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( LLVM_DEBUG( dbgs() << " Skipping hardening base of explicit stack frame load: "; MI.dump(); dbgs() << "\n"); + } else if (BaseMO.getReg() == X86::RSP) { + // Some idempotent atomic operations are lowered directly to a locked + // OR with 0 to the top of stack(or slightly offset from top) which uses an + // explicit RSP register as the base. + assert(IndexMO.getReg() == X86::NoRegister && + "Explicit RSP access with dynamic index!"); + LLVM_DEBUG( + dbgs() << " Cannot harden base of explicit RSP offset in a load!"); } else if (BaseMO.getReg() == X86::RIP || BaseMO.getReg() == X86::NoRegister) { // For both RIP-relative addressed loads or absolute loads, we cannot @@ -2464,7 +2470,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( // If we have no red zones or if the function returns twice (possibly without // using the `ret` instruction) like setjmp, we need to save the expected // return address prior to the call. - if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone) || + if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) || MF.exposesReturnsTwice()) { // If we don't have red zones, we need to compute the expected return // address prior to the call and store it in a register that lives across @@ -2546,12 +2552,13 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( // Now conditionally update the predicate state we just extracted if we ended // up at a different return address than expected. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; - auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes); + auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg) .addReg(NewStateReg, RegState::Kill) - .addReg(PS->PoisonReg); + .addReg(PS->PoisonReg) + .addImm(X86::COND_NE); CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); ++NumInstsInserted; LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n"); diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 0c9ce8802e1b..d5bb56603df9 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -1,9 +1,8 @@ //===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,6 +14,7 @@ #include "X86CallLowering.h" #include "X86LegalizerInfo.h" +#include "X86MacroFusion.h" #include "X86RegisterBankInfo.h" #include "X86Subtarget.h" #include "MCTargetDesc/X86BaseInfo.h" @@ -176,10 +176,13 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, if (TM.shouldAssumeDSOLocal(M, GV)) return X86II::MO_NO_FLAG; + // Functions on COFF can be non-DSO local for two reasons: + // - They are marked dllimport + // - They are extern_weak, and a stub is needed if (isTargetCOFF()) { - assert(GV->hasDLLImportStorageClass() && - "shouldAssumeDSOLocal gave inconsistent answer"); - return X86II::MO_DLLIMPORT; + if (GV->hasDLLImportStorageClass()) + return X86II::MO_DLLIMPORT; + return X86II::MO_COFFSTUB; } const Function *F = dyn_cast_or_null<Function>(GV); @@ -367,3 +370,8 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } + +void X86Subtarget::getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back(createX86MacroFusionDAGMutation()); +} diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index b1103f823e7f..24ccc9cb7843 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -1,9 +1,8 @@ //===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -89,6 +88,9 @@ protected: /// True if the processor supports X87 instructions. bool HasX87 = false; + /// True if the processor supports CMPXCHG8B. + bool HasCmpxchg8b = false; + /// True if this processor has NOPL instruction /// (generally pentium pro+). bool HasNOPL = false; @@ -295,6 +297,9 @@ protected: /// True if the processor supports macrofusion. bool HasMacroFusion = false; + /// True if the processor supports branch fusion. + bool HasBranchFusion = false; + /// True if the processor has enhanced REP MOVSB/STOSB. bool HasERMSB = false; @@ -348,9 +353,18 @@ protected: /// Processor has AVX-512 Vector Neural Network Instructions bool HasVNNI = false; + /// Processor has AVX-512 bfloat16 floating-point extensions + bool HasBF16 = false; + + /// Processor supports ENQCMD instructions + bool HasENQCMD = false; + /// Processor has AVX-512 Bit Algorithms instructions bool HasBITALG = false; + /// Processor has AVX-512 vp2intersect instructions + bool HasVP2INTERSECT = false; + /// Processor supports MPX - Memory Protection Extensions bool HasMPX = false; @@ -388,6 +402,12 @@ protected: /// Try harder to combine to horizontal vector ops if they are fast. bool HasFastHorizontalOps = false; + /// Prefer a left/right scalar logical shifts pair over a shift+and pair. + bool HasFastScalarShiftMasks = false; + + /// Prefer a left/right vector logical shifts pair over a shift+and pair. 
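[Editor's note] The getPostRAMutations hook added in X86Subtarget.cpp above feeds the same macro-fusion DAG mutation into the post-RA scheduler that X86TargetMachine registers for the pre-RA scheduler later in this diff. A condensed sketch of the wiring, mirroring the hunk:

  // Keep fusible pairs (e.g. a cmp/test followed by its conditional branch)
  // adjacent after register allocation so the core can fuse them.
  void X86Subtarget::getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
    Mutations.push_back(createX86MacroFusionDAGMutation());
  }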
+ bool HasFastVectorShiftMasks = false; + /// Use a retpoline thunk rather than indirect calls to block speculative /// execution. bool UseRetpolineIndirectCalls = false; @@ -547,6 +567,7 @@ public: void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } bool hasX87() const { return HasX87; } + bool hasCmpxchg8b() const { return HasCmpxchg8b; } bool hasNOPL() const { return HasNOPL; } // SSE codegen depends on cmovs, and all SSE1+ processors support them. // All 64-bit processors support cmov. @@ -621,7 +642,7 @@ public: int getGatherOverhead() const { return GatherOverhead; } int getScatterOverhead() const { return ScatterOverhead; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } - bool hasCmpxchg16b() const { return HasCmpxchg16b; } + bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } bool useLeaForSP() const { return UseLeaForSP; } bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } @@ -638,7 +659,10 @@ public: bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } bool hasFastBEXTR() const { return HasFastBEXTR; } bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } + bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; } + bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; } bool hasMacroFusion() const { return HasMacroFusion; } + bool hasBranchFusion() const { return HasBranchFusion; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } @@ -657,6 +681,8 @@ public: bool hasVLX() const { return HasVLX; } bool hasPKU() const { return HasPKU; } bool hasVNNI() const { return HasVNNI; } + bool hasBF16() const { return HasBF16; } + bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } bool hasBITALG() const { return HasBITALG; } bool hasMPX() const { return HasMPX; } bool hasSHSTK() const { return HasSHSTK; } @@ -669,6 +695,7 @@ public: bool hasSGX() const { return HasSGX; } bool threewayBranchProfitable() const { return ThreewayBranchProfitable; } bool hasINVPCID() const { return HasINVPCID; } + bool hasENQCMD() const { return HasENQCMD; } bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } bool useRetpolineIndirectBranches() const { return UseRetpolineIndirectBranches; @@ -744,10 +771,6 @@ public: return TargetTriple.isWindowsMSVCEnvironment(); } - bool isTargetKnownWindowsMSVC() const { - return TargetTriple.isKnownWindowsMSVCEnvironment(); - } - bool isTargetWindowsCoreCLR() const { return TargetTriple.isWindowsCoreCLREnvironment(); } @@ -834,11 +857,11 @@ public: /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } - // TODO: Update the regression tests and return true. 
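[Editor's note] The HasFastScalarShiftMasks / HasFastVectorShiftMasks flags introduced above are tuning hints about how to materialize bit masks. A rough illustration of the scalar case (assembly shown for explanation only, not taken from this diff):

  #include <cstdint>

  // Clearing the top 16 bits of a 64-bit value can be lowered either as
  //   movabsq $0x0000FFFFFFFFFFFF, %rcx ; andq %rcx, %rax    (constant + and)
  // or as
  //   shlq $16, %rax ; shrq $16, %rax                        (shift pair)
  // The new flags tell the backend which of the two forms the target prefers.
  uint64_t low48(uint64_t x) { return x & 0x0000FFFFFFFFFFFFULL; }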
- bool supportPrintSchedInfo() const override { return false; } - bool enableEarlyIfConversion() const override; + void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> + &Mutations) const override; + AntiDepBreakMode getAntiDepBreakMode() const override { return TargetSubtargetInfo::ANTIDEP_CRITICAL; } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 217a12ddf896..0cbf13899a29 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -1,9 +1,8 @@ //===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,6 +12,7 @@ #include "X86TargetMachine.h" #include "MCTargetDesc/X86MCTargetDesc.h" +#include "TargetInfo/X86TargetInfo.h" #include "X86.h" #include "X86CallLowering.h" #include "X86LegalizerInfo.h" @@ -71,9 +71,10 @@ extern "C" void LLVMInitializeX86Target() { initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); initializeFixupLEAPassPass(PR); - initializeShadowCallStackPass(PR); + initializeFPSPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); + initializeX86ExpandPseudoPass(PR); initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); initializeX86AvoidSFBPassPass(PR); @@ -195,7 +196,7 @@ static CodeModel::Model getEffectiveX86CodeModel(Optional<CodeModel::Model> CM, bool JIT, bool Is64Bit) { if (CM) { if (*CM == CodeModel::Tiny) - report_fatal_error("Target does not support the tiny CodeModel"); + report_fatal_error("Target does not support the tiny CodeModel", false); return *CM; } if (JIT) @@ -358,6 +359,13 @@ public: return DAG; } + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + ScheduleDAGMI *DAG = createGenericSchedPostRA(C); + DAG->addMutation(createX86MacroFusionDAGMutation()); + return DAG; + } + void addIRPasses() override; bool addInstSelector() override; bool addIRTranslator() override; @@ -372,6 +380,8 @@ public: void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; + + std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; class X86ExecutionDomainFix : public ExecutionDomainFix { @@ -491,7 +501,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createBreakFalseDeps()); } - addPass(createShadowCallStackPass()); addPass(createX86IndirectBranchTrackingPass()); if (UseVZeroUpper) @@ -519,3 +528,7 @@ void X86PassConfig::addPreEmitPass2() { MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI)) addPass(createCFIInstrInserter()); } + +std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h index f5b45da0c3dc..b999e2e86af6 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -1,9 +1,8 @@ //===-- X86TargetMachine.h - Define 
TargetMachine for the X86 ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 505c4fa07b77..92e0779c2e74 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index d045094edb1e..13d7b4ad70d6 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -1,9 +1,8 @@ //===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 36929a4f5439..3dc59aeb263e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -1651,17 +1650,77 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry SSE2CostTbl[] = { - { ISD::SETCC, MVT::v2i64, 8 }, - { ISD::SETCC, MVT::v4i32, 1 }, - { ISD::SETCC, MVT::v8i16, 1 }, - { ISD::SETCC, MVT::v16i8, 1 }, + unsigned ExtraCost = 0; + if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) { + // Some vector comparison predicates cost extra instructions. 
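[Editor's note] The ExtraCost bookkeeping introduced here accounts for vector compare predicates that have no single instruction before AVX-512; the per-predicate cases are enumerated just below. As a rough illustration of one of them, an unsigned greater-or-equal on 4 x i32 with SSE4.1 can be synthesized from an unsigned min plus an equality compare, one extra instruction on top of the base pcmpeqd:

  #include <smmintrin.h>

  // x >= y (unsigned) holds exactly when min(x, y) == y.
  static __m128i uge_epi32(__m128i x, __m128i y) {
    return _mm_cmpeq_epi32(_mm_min_epu32(x, y), y);   // pminud + pcmpeqd
  }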
+ if (MTy.isVector() && + !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || + (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || + ST->hasBWI())) { + switch (cast<CmpInst>(I)->getPredicate()) { + case CmpInst::Predicate::ICMP_NE: + // xor(cmpeq(x,y),-1) + ExtraCost = 1; + break; + case CmpInst::Predicate::ICMP_SGE: + case CmpInst::Predicate::ICMP_SLE: + // xor(cmpgt(x,y),-1) + ExtraCost = 1; + break; + case CmpInst::Predicate::ICMP_ULT: + case CmpInst::Predicate::ICMP_UGT: + // cmpgt(xor(x,signbit),xor(y,signbit)) + // xor(cmpeq(pmaxu(x,y),x),-1) + ExtraCost = 2; + break; + case CmpInst::Predicate::ICMP_ULE: + case CmpInst::Predicate::ICMP_UGE: + if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || + (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { + // cmpeq(psubus(x,y),0) + // cmpeq(pminu(x,y),x) + ExtraCost = 1; + } else { + // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) + ExtraCost = 3; + } + break; + default: + break; + } + } + } + + static const CostTblEntry AVX512BWCostTbl[] = { + { ISD::SETCC, MVT::v32i16, 1 }, + { ISD::SETCC, MVT::v64i8, 1 }, + + { ISD::SELECT, MVT::v32i16, 1 }, + { ISD::SELECT, MVT::v64i8, 1 }, }; - static const CostTblEntry SSE42CostTbl[] = { - { ISD::SETCC, MVT::v2f64, 1 }, - { ISD::SETCC, MVT::v4f32, 1 }, - { ISD::SETCC, MVT::v2i64, 1 }, + static const CostTblEntry AVX512CostTbl[] = { + { ISD::SETCC, MVT::v8i64, 1 }, + { ISD::SETCC, MVT::v16i32, 1 }, + { ISD::SETCC, MVT::v8f64, 1 }, + { ISD::SETCC, MVT::v16f32, 1 }, + + { ISD::SELECT, MVT::v8i64, 1 }, + { ISD::SELECT, MVT::v16i32, 1 }, + { ISD::SELECT, MVT::v8f64, 1 }, + { ISD::SELECT, MVT::v16f32, 1 }, + }; + + static const CostTblEntry AVX2CostTbl[] = { + { ISD::SETCC, MVT::v4i64, 1 }, + { ISD::SETCC, MVT::v8i32, 1 }, + { ISD::SETCC, MVT::v16i16, 1 }, + { ISD::SETCC, MVT::v32i8, 1 }, + + { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb + { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb + { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb + { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb }; static const CostTblEntry AVX1CostTbl[] = { @@ -1672,50 +1731,83 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, { ISD::SETCC, MVT::v8i32, 4 }, { ISD::SETCC, MVT::v16i16, 4 }, { ISD::SETCC, MVT::v32i8, 4 }, + + { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd + { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps + { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd + { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps + { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps + { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps }; - static const CostTblEntry AVX2CostTbl[] = { - { ISD::SETCC, MVT::v4i64, 1 }, - { ISD::SETCC, MVT::v8i32, 1 }, - { ISD::SETCC, MVT::v16i16, 1 }, - { ISD::SETCC, MVT::v32i8, 1 }, + static const CostTblEntry SSE42CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + { ISD::SETCC, MVT::v2i64, 1 }, }; - static const CostTblEntry AVX512CostTbl[] = { - { ISD::SETCC, MVT::v8i64, 1 }, - { ISD::SETCC, MVT::v16i32, 1 }, - { ISD::SETCC, MVT::v8f64, 1 }, - { ISD::SETCC, MVT::v16f32, 1 }, + static const CostTblEntry SSE41CostTbl[] = { + { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd + { ISD::SELECT, MVT::v4f32, 1 }, // blendvps + { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb + { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb + { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb + { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb }; - static const CostTblEntry AVX512BWCostTbl[] = { - { ISD::SETCC, MVT::v32i16, 1 }, - { ISD::SETCC, MVT::v64i8, 1 }, + static const 
CostTblEntry SSE2CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 2 }, + { ISD::SETCC, MVT::f64, 1 }, + { ISD::SETCC, MVT::v2i64, 8 }, + { ISD::SETCC, MVT::v4i32, 1 }, + { ISD::SETCC, MVT::v8i16, 1 }, + { ISD::SETCC, MVT::v16i8, 1 }, + + { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por + { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por + }; + + static const CostTblEntry SSE1CostTbl[] = { + { ISD::SETCC, MVT::v4f32, 2 }, + { ISD::SETCC, MVT::f32, 1 }, + + { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps }; if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) + return LT.first * (ExtraCost + Entry->Cost); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) - return LT.first * Entry->Cost; + return LT.first * (ExtraCost + Entry->Cost); + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) + return LT.first * (ExtraCost + Entry->Cost); return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } @@ -1784,6 +1876,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq + { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd + { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq + { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq + { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq }; static const CostTblEntry XOPCostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 4 }, @@ -1825,6 +1921,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::SSUBSAT, MVT::v32i8, 1 }, { ISD::UADDSAT, MVT::v16i16, 1 }, { ISD::UADDSAT, MVT::v32i8, 1 }, + { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd { ISD::USUBSAT, MVT::v16i16, 1 }, { ISD::USUBSAT, MVT::v32i8, 1 }, { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd @@ -1861,6 +1958,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert @@ -1885,6 +1983,7 @@ int 
X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, }; static const CostTblEntry SSE42CostTbl[] = { { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd + { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ }; @@ -1945,14 +2044,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::BITREVERSE, MVT::i64, 14 } + { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::SADDO, MVT::i64, 1 }, + { ISD::UADDO, MVT::i64, 1 }, }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, - { ISD::BITREVERSE, MVT::i8, 11 } + { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::SADDO, MVT::i32, 1 }, + { ISD::SADDO, MVT::i16, 1 }, + { ISD::SADDO, MVT::i8, 1 }, + { ISD::UADDO, MVT::i32, 1 }, + { ISD::UADDO, MVT::i16, 1 }, + { ISD::UADDO, MVT::i8, 1 }, }; + Type *OpTy = RetTy; unsigned ISD = ISD::DELETED_NODE; switch (IID) { default: @@ -1987,11 +2095,23 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::sqrt: ISD = ISD::FSQRT; break; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + // SSUBO has same costs so don't duplicate. + ISD = ISD::SADDO; + OpTy = RetTy->getContainedType(0); + break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: + // USUBO has same costs so don't duplicate. + ISD = ISD::UADDO; + OpTy = RetTy->getContainedType(0); + break; } if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); MVT MTy = LT.second; // Attempt to lookup cost. 
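[Editor's note] The new SADDO/UADDO table entries and the mapping of the sadd/uadd/ssub/usub.with.overflow intrinsics above reflect that on x86 the overflow and carry flags are produced by the add or sub itself, so the intrinsic costs little more than the plain operation; the table is keyed on the operand type, hence OpTy = RetTy->getContainedType(0). A small illustration using a compiler builtin (for clarity only):

  #include <cstdint>

  bool addOverflows(int32_t a, int32_t b, int32_t &out) {
    return __builtin_sadd_overflow(a, b, &out);   // typically lowers to addl + seto
  }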
@@ -2226,6 +2346,9 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned Alignment, unsigned AddressSpace) { + bool IsLoad = (Instruction::Load == Opcode); + bool IsStore = (Instruction::Store == Opcode); + VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask @@ -2233,10 +2356,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = - VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); - if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || - (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || - !isPowerOf2_32(NumElem)) { + VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); + if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) || + (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { // Scalarization int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); int ScalarCompareCost = getCmpSelInstrCost( @@ -2244,8 +2366,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, int BranchCost = getCFInstrCost(Instruction::Br); int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - int ValueSplitCost = getScalarizationOverhead( - SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore); int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); @@ -2259,8 +2380,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. - Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) + - getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr); + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), @@ -2268,11 +2389,13 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, // Expanding requires fill mask with zeroes Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); } + + // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. if (!ST->hasAVX512()) - return Cost + LT.first*4; // Each maskmov costs 4 + return Cost + LT.first * (IsLoad ? 2 : 8); // AVX-512 masked load/store is cheapper - return Cost+LT.first; + return Cost + LT.first; } int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -2281,7 +2404,7 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. - unsigned NumVectorInstToHideOverhead = 10; + const unsigned NumVectorInstToHideOverhead = 10; // Cost modeling of Strided Access Computation is hidden by the indexing // modes of X86 regardless of the stride value. 
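[Editor's note] On the pre-AVX-512 costs just above: without AVX-512 a legal masked load or store is emitted as a vmaskmov-style instruction, and the store side is considerably more expensive than the load side, which is what the asymmetric 2 vs. 8 factors encode. What such a lowering looks like at the intrinsic level (AVX, 32-bit elements):

  #include <immintrin.h>

  __m256 maskedLoad(const float *p, __m256i mask) {
    return _mm256_maskload_ps(p, mask);            // vmaskmovps (load form)
  }
  void maskedStore(float *p, __m256i mask, __m256 v) {
    _mm256_maskstore_ps(p, mask, v);               // vmaskmovps (store form)
  }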
We dont believe that there @@ -2369,6 +2492,48 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, return LT.first * Entry->Cost; } + static const CostTblEntry AVX2BoolReduction[] = { + { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp + { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp + { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp + { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp + }; + + static const CostTblEntry AVX1BoolReduction[] = { + { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp + { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp + { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp + { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp + { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp + { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp + { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp + { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp + }; + + static const CostTblEntry SSE2BoolReduction[] = { + { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp + { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp + { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp + { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp + { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp + { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp + { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp + { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp + }; + + // Handle bool allof/anyof patterns. + if (ValTy->getVectorElementType()->isIntegerTy(1)) { + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; + } + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); } @@ -2390,15 +2555,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. 
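[Editor's note] The new allof/anyof bool-reduction tables above assume the usual movemask lowering: the compare result is compressed into a scalar bitmask and tested against all-ones or zero, typically a movmsk plus one scalar compare (the 2-entries), with AVX1 integer types needing an extra extract and combine (the 4-entries). For example, with SSE:

  #include <xmmintrin.h>

  bool allOf(__m128 cmpResult) {                    // cmpResult from e.g. _mm_cmplt_ps
    return _mm_movemask_ps(cmpResult) == 0xF;       // movmskps + cmp
  }
  bool anyOf(__m128 cmpResult) {
    return _mm_movemask_ps(cmpResult) != 0;         // movmskps + test
  }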
- static const CostTblEntry SSE42CostTblPairWise[] = { + static const CostTblEntry SSE1CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 4}, + }; + + static const CostTblEntry SSE2CostTblPairWise[] = { {ISD::FMINNUM, MVT::v2f64, 3}, + {ISD::SMIN, MVT::v2i64, 6}, + {ISD::UMIN, MVT::v2i64, 8}, + {ISD::SMIN, MVT::v4i32, 6}, + {ISD::UMIN, MVT::v4i32, 8}, + {ISD::SMIN, MVT::v8i16, 4}, + {ISD::UMIN, MVT::v8i16, 6}, + {ISD::SMIN, MVT::v16i8, 8}, + {ISD::UMIN, MVT::v16i8, 6}, + }; + + static const CostTblEntry SSE41CostTblPairWise[] = { {ISD::FMINNUM, MVT::v4f32, 2}, - {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" - {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" + {ISD::SMIN, MVT::v2i64, 9}, + {ISD::UMIN, MVT::v2i64,10}, {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" {ISD::SMIN, MVT::v8i16, 2}, {ISD::UMIN, MVT::v8i16, 2}, + {ISD::SMIN, MVT::v16i8, 3}, + {ISD::UMIN, MVT::v16i8, 3}, + }; + + static const CostTblEntry SSE42CostTblPairWise[] = { + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" }; static const CostTblEntry AVX1CostTblPairWise[] = { @@ -2411,8 +2598,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, {ISD::UMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v16i8, 2}, + {ISD::UMIN, MVT::v16i8, 2}, + {ISD::SMIN, MVT::v4i64, 7}, + {ISD::UMIN, MVT::v4i64, 7}, {ISD::SMIN, MVT::v8i32, 3}, {ISD::UMIN, MVT::v8i32, 3}, + {ISD::SMIN, MVT::v16i16, 3}, + {ISD::UMIN, MVT::v16i16, 3}, + {ISD::SMIN, MVT::v32i8, 3}, + {ISD::UMIN, MVT::v32i8, 3}, }; static const CostTblEntry AVX2CostTblPairWise[] = { @@ -2435,15 +2630,37 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, {ISD::UMIN, MVT::v16i32, 1}, }; - static const CostTblEntry SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE1CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 4}, + }; + + static const CostTblEntry SSE2CostTblNoPairWise[] = { {ISD::FMINNUM, MVT::v2f64, 3}, + {ISD::SMIN, MVT::v2i64, 6}, + {ISD::UMIN, MVT::v2i64, 8}, + {ISD::SMIN, MVT::v4i32, 6}, + {ISD::UMIN, MVT::v4i32, 8}, + {ISD::SMIN, MVT::v8i16, 4}, + {ISD::UMIN, MVT::v8i16, 6}, + {ISD::SMIN, MVT::v16i8, 8}, + {ISD::UMIN, MVT::v16i8, 6}, + }; + + static const CostTblEntry SSE41CostTblNoPairWise[] = { {ISD::FMINNUM, MVT::v4f32, 3}, - {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" - {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" + {ISD::SMIN, MVT::v2i64, 9}, + {ISD::UMIN, MVT::v2i64,11}, {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" + {ISD::SMIN, MVT::v16i8, 3}, + {ISD::UMIN, MVT::v16i8, 3}, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" }; static const CostTblEntry AVX1CostTblNoPairWise[] = { @@ -2456,8 +2673,16 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, {ISD::UMIN, MVT::v4i32, 1}, {ISD::SMIN, MVT::v8i16, 1}, {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v16i8, 2}, + {ISD::UMIN, MVT::v16i8, 2}, + 
{ISD::SMIN, MVT::v4i64, 7}, + {ISD::UMIN, MVT::v4i64, 7}, {ISD::SMIN, MVT::v8i32, 2}, {ISD::UMIN, MVT::v8i32, 2}, + {ISD::SMIN, MVT::v16i16, 2}, + {ISD::UMIN, MVT::v16i16, 2}, + {ISD::SMIN, MVT::v32i8, 2}, + {ISD::UMIN, MVT::v32i8, 2}, }; static const CostTblEntry AVX2CostTblNoPairWise[] = { @@ -2496,6 +2721,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } else { if (ST->hasAVX512()) if (const auto *Entry = @@ -2513,6 +2750,18 @@ int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); @@ -2864,26 +3113,106 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, } bool X86TTIImpl::canMacroFuseCmp() { - return ST->hasMacroFusion(); + return ST->hasMacroFusion() || ST->hasBranchFusion(); } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + if (!ST->hasAVX()) + return false; + // The backend can't handle a single element vector. if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1) return false; Type *ScalarTy = DataTy->getScalarType(); - int DataWidth = isa<PointerType>(ScalarTy) ? - DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) || - ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI()); + if (ScalarTy->isPointerTy()) + return true; + + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 32 || IntWidth == 64 || + ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); } bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { return isLegalMaskedLoad(DataType); } +bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) { + unsigned DataSize = DL.getTypeStoreSize(DataType); + // The only supported nontemporal loads are for aligned vectors of 16 or 32 + // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 + // (the equivalent stores only require AVX). + if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) + return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); + + return false; +} + +bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) { + unsigned DataSize = DL.getTypeStoreSize(DataType); + + // SSE4A supports nontemporal stores of float and double at arbitrary + // alignment. 
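[Editor's note] The SSE2 versus SSE4.1 min/max reduction entries above differ because plain SSE2 has no packed min/max for 32-bit elements; each step has to be synthesized from a compare and a bitwise blend, which is roughly what the higher per-type costs encode. For instance:

  #include <smmintrin.h>

  // SSE2-only signed minimum of 4 x i32: compare, then blend with and/andn/or.
  static __m128i smin_epi32_sse2(__m128i a, __m128i b) {
    __m128i aIsSmaller = _mm_cmplt_epi32(a, b);
    return _mm_or_si128(_mm_and_si128(aIsSmaller, a),
                        _mm_andnot_si128(aIsSmaller, b));
  }
  // With SSE4.1 the same operation is a single instruction:
  static __m128i smin_epi32_sse41(__m128i a, __m128i b) {
    return _mm_min_epi32(a, b);                     // pminsd
  }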
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) + return true; + + // Besides the SSE4A subtarget exception above, only aligned stores are + // available nontemporaly on any other subtarget. And only stores with a size + // of 4..32 bytes (powers of 2, only) are permitted. + if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || + !isPowerOf2_32(DataSize)) + return false; + + // 32-byte vector nontemporal stores are supported by AVX (the equivalent + // loads require AVX2). + if (DataSize == 32) + return ST->hasAVX(); + else if (DataSize == 16) + return ST->hasSSE1(); + return true; +} + +bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { + if (!isa<VectorType>(DataTy)) + return false; + + if (!ST->hasAVX512()) + return false; + + // The backend can't handle a single element vector. + if (DataTy->getVectorNumElements() == 1) + return false; + + Type *ScalarTy = DataTy->getVectorElementType(); + + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 32 || IntWidth == 64 || + ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); +} + +bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { + return isLegalMaskedExpandLoad(DataTy); +} + bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { + // Some CPUs have better gather performance than others. + // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only + // enable gather with a -march. + if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()))) + return false; + // This function is called now in two cases: from the Loop Vectorizer // and from the Scalarizer. // When the Loop Vectorizer asks about legality of the feature, @@ -2902,14 +3231,17 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { return false; } Type *ScalarTy = DataTy->getScalarType(); - int DataWidth = isa<PointerType>(ScalarTy) ? - DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + if (ScalarTy->isPointerTy()) + return true; - // Some CPUs have better gather performance than others. - // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only - // enable gather with a -march. - return (DataWidth == 32 || DataWidth == 64) && - (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())); + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 32 || IntWidth == 64; } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { @@ -2938,44 +3270,51 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, const FeatureBitset &CalleeBits = TM.getSubtargetImpl(*Callee)->getFeatureBits(); - // FIXME: This is likely too limiting as it will include subtarget features - // that we might not care about for inlining, but it is conservatively - // correct. - return (CallerBits & CalleeBits) == CalleeBits; + FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; + FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; + return (RealCallerBits & RealCalleeBits) == RealCalleeBits; } -const X86TTIImpl::TTI::MemCmpExpansionOptions * -X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { - // Only enable vector loads for equality comparison. - // Right now the vector version is not as fast, see #33329. 
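[Editor's note] The nontemporal legality rules above decide whether the vectorizer may use !nontemporal loads and stores of a given size and alignment; the SSE4A scalar movntss/movntsd stores are the one unaligned exception. At the source level these accesses correspond to the streaming intrinsics, for example:

  #include <immintrin.h>

  // Compiled with AVX2 enabled; the aligned forms require alignment equal to
  // the access size, as the legality checks above insist.
  void streamStore(float *dst32B, double *dst16B, __m256 v, __m128d w) {
    _mm256_stream_ps(dst32B, v);               // 32-byte NT store (vmovntps): AVX
    _mm_stream_pd(dst16B, w);                  // 16-byte NT store (movntpd): SSE2
  }
  __m256i streamLoad(const __m256i *src32B) {
    return _mm256_stream_load_si256(src32B);   // 32-byte NT load (vmovntdqa): AVX2
  }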
- static const auto ThreeWayOptions = [this]() { - TTI::MemCmpExpansionOptions Options; - if (ST->is64Bit()) { - Options.LoadSizes.push_back(8); - } - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); - return Options; - }(); - static const auto EqZeroOptions = [this]() { - TTI::MemCmpExpansionOptions Options; +bool X86TTIImpl::areFunctionArgsABICompatible( + const Function *Caller, const Function *Callee, + SmallPtrSetImpl<Argument *> &Args) const { + if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) + return false; + + // If we get here, we know the target features match. If one function + // considers 512-bit vectors legal and the other does not, consider them + // incompatible. + // FIXME Look at the arguments and only consider 512 bit or larger vectors? + const TargetMachine &TM = getTLI()->getTargetMachine(); + + return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == + TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs(); +} + +X86TTIImpl::TTI::MemCmpExpansionOptions +X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = 2; + if (IsZeroCmp) { + // Only enable vector loads for equality comparison. Right now the vector + // version is not as fast for three way compare (see #33329). // TODO: enable AVX512 when the DAG is ready. // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); - if (ST->hasAVX2()) Options.LoadSizes.push_back(32); - if (ST->hasSSE2()) Options.LoadSizes.push_back(16); - if (ST->is64Bit()) { - Options.LoadSizes.push_back(8); - } - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); + const unsigned PreferredWidth = ST->getPreferVectorWidth(); + if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32); + if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); // All GPR and vector loads can be unaligned. SIMD compare requires integer // vectors (SSE2/AVX2). Options.AllowOverlappingLoads = true; - return Options; - }(); - return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; + } + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; } bool X86TTIImpl::enableInterleavedAccessVectorization() { diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index 1637592c81f8..25d9c33eb16d 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -1,9 +1,8 @@ //===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
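[Editor's note] The rewritten enableMemCmpExpansion above returns the options by value and only adds the 16/32-byte vector load sizes for equality comparisons, since a three-way compare would still need to locate the first differing byte. The equality expansion it enables boils down to a compare-and-movemask per chunk, roughly:

  #include <emmintrin.h>

  // Sketch of one 16-byte equality chunk as the expansion would emit it
  // (unaligned loads are fine; overlapping chunks cover odd sizes).
  static bool equal16(const char *a, const char *b) {
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    return _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) == 0xFFFF;
  }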
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -36,6 +35,64 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } + const FeatureBitset InlineFeatureIgnoreList = { + // This indicates the CPU is 64 bit capable not that we are in 64-bit + // mode. + X86::Feature64Bit, + + // These features don't have any intrinsics or ABI effect. + X86::FeatureNOPL, + X86::FeatureCMPXCHG16B, + X86::FeatureLAHFSAHF, + + // Codegen control options. + X86::FeatureFast11ByteNOP, + X86::FeatureFast15ByteNOP, + X86::FeatureFastBEXTR, + X86::FeatureFastHorizontalOps, + X86::FeatureFastLZCNT, + X86::FeatureFastPartialYMMorZMMWrite, + X86::FeatureFastScalarFSQRT, + X86::FeatureFastSHLDRotate, + X86::FeatureFastScalarShiftMasks, + X86::FeatureFastVectorShiftMasks, + X86::FeatureFastVariableShuffle, + X86::FeatureFastVectorFSQRT, + X86::FeatureLEAForSP, + X86::FeatureLEAUsesAG, + X86::FeatureLZCNTFalseDeps, + X86::FeatureBranchFusion, + X86::FeatureMacroFusion, + X86::FeatureMergeToThreeWayBranch, + X86::FeaturePadShortFunctions, + X86::FeaturePOPCNTFalseDeps, + X86::FeatureSSEUnalignedMem, + X86::FeatureSlow3OpsLEA, + X86::FeatureSlowDivide32, + X86::FeatureSlowDivide64, + X86::FeatureSlowIncDec, + X86::FeatureSlowLEA, + X86::FeatureSlowPMADDWD, + X86::FeatureSlowPMULLD, + X86::FeatureSlowSHLD, + X86::FeatureSlowTwoMemOps, + X86::FeatureSlowUAMem16, + + // Perf-tuning flags. + X86::FeatureHasFastGather, + X86::FeatureSlowUAMem32, + + // Based on whether user set the -mprefer-vector-width command line. + X86::FeaturePrefer256Bit, + + // CPU name enums. These just follow CPU string. 
+ X86::ProcIntelAtom, + X86::ProcIntelGLM, + X86::ProcIntelGLP, + X86::ProcIntelSLM, + X86::ProcIntelTRM, + }; + public: explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -129,14 +186,21 @@ public: bool canMacroFuseCmp(); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); + bool isLegalNTLoad(Type *DataType, unsigned Alignment); + bool isLegalNTStore(Type *DataType, unsigned Alignment); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); + bool isLegalMaskedExpandLoad(Type *DataType); + bool isLegalMaskedCompressStore(Type *DataType); bool hasDivRemOp(Type *DataType, bool IsSigned); bool isFCmpOrdCheaperThanFCmpZero(Type *Ty); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const; + bool areFunctionArgsABICompatible(const Function *Caller, + const Function *Callee, + SmallPtrSetImpl<Argument *> &Args) const; + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index f882b760927c..a07d2f20acab 100644 --- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -1,9 +1,8 @@ //===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp index d298aaa97ecd..9e499db1d7ee 100644 --- a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp @@ -1,9 +1,8 @@ //===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -85,10 +84,6 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) { unsigned AmountReg = MI->getOperand(0).getReg(); MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg); - // Look through copies. 
- while (Def && Def->isCopy() && Def->getOperand(1).isReg()) - Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg()); - if (!Def || (Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) || !Def->getOperand(1).isImm()) @@ -210,15 +205,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { return; } + // These two variables differ on x32, which is a 64-bit target with a + // 32-bit alloca. bool Is64Bit = STI->is64Bit(); + bool Is64BitAlloca = MI->getOpcode() == X86::WIN_ALLOCA_64; assert(SlotSize == 4 || SlotSize == 8); - unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX; switch (L) { - case TouchAndSub: + case TouchAndSub: { assert(Amount >= SlotSize); // Use a push to touch the top of the stack. + unsigned RegA = Is64Bit ? X86::RAX : X86::EAX; BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(RegA, RegState::Undef); Amount -= SlotSize; @@ -227,15 +225,18 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { // Fall through to make any remaining adjustment. LLVM_FALLTHROUGH; + } case Sub: assert(Amount > 0); if (Amount == SlotSize) { // Use push to save size. + unsigned RegA = Is64Bit ? X86::RAX : X86::EAX; BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(RegA, RegState::Undef); } else { // Sub. - BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr) + BuildMI(*MBB, I, DL, + TII->get(getSubOpcode(Is64BitAlloca, Amount)), StackPtr) .addReg(StackPtr) .addImm(Amount); } @@ -243,16 +244,17 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { case Probe: if (!NoStackArgProbe) { // The probe lowering expects the amount in RAX/EAX. + unsigned RegA = Is64BitAlloca ? X86::RAX : X86::EAX; BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA) .addReg(MI->getOperand(0).getReg()); // Do the probe. STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL, - /*InPrologue=*/false); + /*InProlog=*/false); } else { // Sub - BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::SUB64rr : X86::SUB32rr), - StackPtr) + BuildMI(*MBB, I, DL, + TII->get(Is64BitAlloca ? X86::SUB64rr : X86::SUB32rr), StackPtr) .addReg(StackPtr) .addReg(MI->getOperand(0).getReg()); } @@ -262,18 +264,10 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { unsigned AmountReg = MI->getOperand(0).getReg(); MI->eraseFromParent(); - // Delete the definition of AmountReg, possibly walking a chain of copies. - for (;;) { - if (!MRI->use_empty(AmountReg)) - break; - MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg); - if (!AmountDef) - break; - if (AmountDef->isCopy() && AmountDef->getOperand(1).isReg()) - AmountReg = AmountDef->getOperand(1).isReg(); - AmountDef->eraseFromParent(); - break; - } + // Delete the definition of AmountReg. + if (MRI->use_empty(AmountReg)) + if (MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg)) + AmountDef->eraseFromParent(); } bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) { diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp index 185deda97c1f..f68d17d7256d 100644 --- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -1,9 +1,8 @@ //===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -41,9 +40,7 @@ class WinEHStatePass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHStatePass() : FunctionPass(ID) { - initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); - } + WinEHStatePass() : FunctionPass(ID) { } bool runOnFunction(Function &Fn) override; @@ -87,15 +84,15 @@ private: StructType *EHLinkRegistrationTy = nullptr; StructType *CXXEHRegistrationTy = nullptr; StructType *SEHRegistrationTy = nullptr; - Constant *SetJmp3 = nullptr; - Constant *CxxLongjmpUnwind = nullptr; + FunctionCallee SetJmp3 = nullptr; + FunctionCallee CxxLongjmpUnwind = nullptr; // Per-function state EHPersonality Personality = EHPersonality::Unknown; Function *PersonalityFn = nullptr; bool UseStackGuard = false; int ParentBaseState; - Constant *SehLongjmpUnwind = nullptr; + FunctionCallee SehLongjmpUnwind = nullptr; Constant *Cookie = nullptr; /// The stack allocation containing all EH data, including the link in the @@ -304,7 +301,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { CxxLongjmpUnwind = TheModule->getOrInsertFunction( "__CxxLongjmpUnwind", FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false)); - cast<Function>(CxxLongjmpUnwind->stripPointerCasts()) + cast<Function>(CxxLongjmpUnwind.getCallee()->stripPointerCasts()) ->setCallingConv(CallingConv::X86_StdCall); } else if (Personality == EHPersonality::MSVC_X86SEH) { // If _except_handler4 is in use, some additional guard checks and prologue @@ -357,7 +354,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind", FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType, /*isVarArg=*/false)); - cast<Function>(SehLongjmpUnwind->stripPointerCasts()) + cast<Function>(SehLongjmpUnwind.getCallee()->stripPointerCasts()) ->setCallingConv(CallingConv::X86_StdCall); } else { llvm_unreachable("unexpected personality function"); @@ -412,7 +409,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); auto AI = Trampoline->arg_begin(); Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; - CallInst *Call = Builder.CreateCall(CastPersonality, Args); + CallInst *Call = Builder.CreateCall(TargetFuncTy, CastPersonality, Args); // Can't use musttail due to prototype mismatch, but we can use tail. Call->setTailCall(true); // Set inreg so we pass it in EAX. 
@@ -433,7 +430,7 @@ void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder, // Next = [fs:00] Constant *FSZero = Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); - Value *Next = Builder.CreateLoad(FSZero); + Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), FSZero); Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0)); // [fs:00] = Link Builder.CreateStore(Link, FSZero); @@ -448,8 +445,8 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { } Type *LinkTy = getEHLinkRegistrationType(); // [fs:00] = Link->Next - Value *Next = - Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0)); + Value *Next = Builder.CreateLoad(LinkTy->getPointerTo(), + Builder.CreateStructGEP(LinkTy, Link, 0)); Constant *FSZero = Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); Builder.CreateStore(Next, FSZero); @@ -472,11 +469,11 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, SmallVector<Value *, 3> OptionalArgs; if (Personality == EHPersonality::MSVC_CXX) { - OptionalArgs.push_back(CxxLongjmpUnwind); + OptionalArgs.push_back(CxxLongjmpUnwind.getCallee()); OptionalArgs.push_back(State); OptionalArgs.push_back(emitEHLSDA(Builder, &F)); } else if (Personality == EHPersonality::MSVC_X86SEH) { - OptionalArgs.push_back(SehLongjmpUnwind); + OptionalArgs.push_back(SehLongjmpUnwind.getCallee()); OptionalArgs.push_back(State); if (UseStackGuard) OptionalArgs.push_back(Cookie); @@ -767,7 +764,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { if (!CS) continue; if (CS.getCalledValue()->stripPointerCasts() != - SetJmp3->stripPointerCasts()) + SetJmp3.getCallee()->stripPointerCasts()) continue; SetJmp3CallSites.push_back(CS); @@ -782,9 +779,9 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { IRBuilder<> Builder(CS.getInstruction()); Value *State; if (InCleanup) { - Value *StateField = - Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex); - State = Builder.CreateLoad(StateField); + Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(), + RegNode, StateFieldIndex); + State = Builder.CreateLoad(Builder.getInt32Ty(), StateField); } else { State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS)); } @@ -794,7 +791,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) { IRBuilder<> Builder(IP); - Value *StateField = - Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex); + Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(), + RegNode, StateFieldIndex); Builder.CreateStore(Builder.getInt32(State), StateField); } |
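[Editor's note] The WinEHState changes in the last few hunks are mostly mechanical API migrations: getOrInsertFunction now returns a FunctionCallee (hence the .getCallee() calls), and CreateLoad/CreateStructGEP/CreateCall take their types explicitly instead of recovering them from pointer element types. A condensed sketch of the new idioms, with placeholder names (Builder, M, RegNode are assumed to be in scope):

  // Declare the runtime helper; FunctionCallee bundles the callee value with
  // its function type.
  FunctionCallee Unwind = M.getOrInsertFunction(
      "__CxxLongjmpUnwind",
      FunctionType::get(Builder.getVoidTy(), Builder.getInt8PtrTy(),
                        /*isVarArg=*/false));
  cast<Function>(Unwind.getCallee()->stripPointerCasts())
      ->setCallingConv(CallingConv::X86_StdCall);

  // Typed GEP and load: the struct type and the loaded type are now spelled
  // out instead of being inferred from RegNode's pointer type.
  Value *Field = Builder.CreateStructGEP(RegNode->getAllocatedType(), RegNode,
                                         /*FieldIdx=*/1);
  Value *State = Builder.CreateLoad(Builder.getInt32Ty(), Field);
  (void)State;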